public override BitArray Bits(IndexReader reader)
{
    // reader.GetVersion could be used to cache
    // Debug.WriteLine(reader.GetVersion()); // could be used to cache
    // if (cached reader == reader && _revFirst ==

    if (_revFirst == All || _revLast == All) // optimization
    {
        return(new BitArray(reader.MaxDoc(), true));
    }

    BitArray last_bits = new BitArray(reader.MaxDoc(), false);
    TermEnum t = reader.Terms(new Term(FieldName.RevisionLast, _revFirst.ToString(RevFormat)));
    TermDocs d = reader.TermDocs();

    //if (t.SkipTo((new Term(FieldName.RevisionLast, revision.ToString(RevFormat))))) // extremely slow
    if (t.Term() != null)
    {
        while (t.Term().Field() == FieldName.RevisionLast)
        {
            d.Seek(t);
            while (d.Next())
            {
                last_bits[d.Doc()] = true;
            }
            if (!t.Next())
            {
                break;
            }
        }
    }

    // optimization, skip if we are just using the head revision
    if (_revLast == Head)
    {
        return(last_bits);
    }

    BitArray first_bits = new BitArray(reader.MaxDoc(), true);
    t = reader.Terms(new Term("rev_first", (_revLast + 1).ToString(RevFormat)));

    //if (t.SkipTo((new Term("rev_first", (revision + 1).ToString(RevFormat))))) // extremely slow
    if (t.Term() != null)
    {
        while (t.Term().Field() == "rev_first")
        {
            d.Seek(t);
            while (d.Next())
            {
                first_bits[d.Doc()] = false;
            }
            if (!t.Next())
            {
                break;
            }
        }
    }

    return(last_bits.And(first_bits));
}
public virtual void TestTerms()
{
    TermEnum terms = _reader.Terms();
    Assert.IsTrue(terms != null);
    while (terms.Next() == true)
    {
        Term term = terms.Term;
        Assert.IsTrue(term != null);
        //System.out.println("Term: " + term);
        System.String fieldValue = (System.String)DocHelper.NameValues[term.Field];
        Assert.IsTrue(fieldValue.IndexOf(term.Text) != -1);
    }

    TermDocs termDocs = _reader.TermDocs();
    Assert.IsTrue(termDocs != null);
    termDocs.Seek(new Term(DocHelper.TextField1Key, "field"));
    Assert.IsTrue(termDocs.Next() == true);

    termDocs.Seek(new Term(DocHelper.NoNormsKey, DocHelper.NoNormsText));
    Assert.IsTrue(termDocs.Next() == true);

    TermPositions positions = _reader.TermPositions();
    positions.Seek(new Term(DocHelper.TextField1Key, "field"));
    Assert.IsTrue(positions != null);
    Assert.IsTrue(positions.Doc == 0);
    Assert.IsTrue(positions.NextPosition() >= 0);
}
public virtual void TestTerms() { TermEnum terms = reader.Terms(); Assert.IsTrue(terms != null); while (terms.Next() == true) { Term term = terms.Term(); Assert.IsTrue(term != null); //System.out.println("Term: " + term); System.String fieldValue = (System.String)DocHelper.nameValues[term.Field()]; Assert.IsTrue(fieldValue.IndexOf(term.Text()) != -1); } TermDocs termDocs = reader.TermDocs(); Assert.IsTrue(termDocs != null); termDocs.Seek(new Term(DocHelper.TEXT_FIELD_1_KEY, "field")); Assert.IsTrue(termDocs.Next() == true); termDocs.Seek(new Term(DocHelper.NO_NORMS_KEY, DocHelper.NO_NORMS_TEXT)); Assert.IsTrue(termDocs.Next() == true); TermPositions positions = reader.TermPositions(); positions.Seek(new Term(DocHelper.TEXT_FIELD_1_KEY, "field")); Assert.IsTrue(positions != null); Assert.IsTrue(positions.Doc() == 0); Assert.IsTrue(positions.NextPosition() >= 0); }
protected internal override System.Object CreateValue(IndexReader reader, Entry entryKey)
{
    System.String field = StringHelper.Intern(entryKey.field);
    System.String[] retArray = new System.String[reader.MaxDoc];
    TermDocs termDocs = reader.TermDocs();
    TermEnum termEnum = reader.Terms(new Term(field));
    try
    {
        do
        {
            Term term = termEnum.Term;
            if (term == null || (System.Object)term.Field != (System.Object)field)
            {
                break;
            }
            System.String termval = term.Text;
            termDocs.Seek(termEnum);
            while (termDocs.Next())
            {
                retArray[termDocs.Doc] = termval;
            }
        }
        while (termEnum.Next());
    }
    finally
    {
        termDocs.Close();
        termEnum.Close();
    }
    return(retArray);
}
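// Usage sketch (not from the surrounding code): callers normally reach the cache-fill
// routine above through the public FieldCache entry point, which fills the per-document
// string array once per reader/field pair and returns the cached copy on later calls.
// Assumes an already-open IndexReader `reader` and a single-token field named "category"
// (both placeholders); property-vs-method naming varies across Lucene.Net versions.
System.String[] categories =
    Lucene.Net.Search.FieldCache_Fields.DEFAULT.GetStrings(reader, "category");

for (int docId = 0; docId < categories.Length; docId++)
{
    // Entries stay null for documents that have no term in the field.
    if (categories[docId] != null)
    {
        System.Console.Out.WriteLine(docId + " => " + categories[docId]);
    }
}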
private static Hit CreateHit(Document primary_doc, IndexReader secondary_reader, TermDocs term_docs, FieldSelector fields)
{
    Hit hit = DocumentToHit(primary_doc);
    if (secondary_reader == null)
    {
        return(hit);
    }

    // Get the stringified version of the URI
    // exactly as it comes out of the index.
    Term term = new Term("Uri", primary_doc.Get("Uri"));
    term_docs.Seek(term);

    // Move to the first (and only) matching term doc
    term_docs.Next();

    Document secondary_doc = (fields == null)
        ? secondary_reader.Document(term_docs.Doc())
        : secondary_reader.Document(term_docs.Doc(), fields);

    // If we are using the secondary index, now we need to
    // merge the properties from the secondary index
    AddPropertiesToHit(hit, secondary_doc, false);

    return(hit);
}
public virtual void TestTerms() { try { TermEnum terms = reader.Terms(); Assert.IsTrue(terms != null); while (terms.Next() == true) { Term term = terms.Term(); Assert.IsTrue(term != null); //System.out.println("Term: " + term); System.String fieldValue = (System.String)DocHelper.nameValues[term.Field()]; Assert.IsTrue(fieldValue.IndexOf(term.Text()) != -1); } TermDocs termDocs = reader.TermDocs(); Assert.IsTrue(termDocs != null); termDocs.Seek(new Term(DocHelper.TEXT_FIELD_1_KEY, "Field")); Assert.IsTrue(termDocs.Next() == true); TermPositions positions = reader.TermPositions(); positions.Seek(new Term(DocHelper.TEXT_FIELD_1_KEY, "Field")); Assert.IsTrue(positions != null); Assert.IsTrue(positions.Doc() == 0); Assert.IsTrue(positions.NextPosition() >= 0); } catch (System.IO.IOException e) { System.Console.Error.WriteLine(e.StackTrace); Assert.IsTrue(false); } }
public virtual int doTest(int iter, int ndocs, int maxTF, float percentDocs)
{
    Directory dir = new RAMDirectory();

    long start = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
    AddDocs(dir, ndocs, "foo", "val", maxTF, percentDocs);
    long end = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
    System.Console.Out.WriteLine("milliseconds for creation of " + ndocs + " docs = " + (end - start));

    IndexReader reader = IndexReader.Open(dir);
    TermEnum tenum = reader.Terms(new Term("foo", "val"));
    TermDocs tdocs = reader.TermDocs();

    start = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
    int ret = 0;
    for (int i = 0; i < iter; i++)
    {
        tdocs.Seek(tenum);
        while (tdocs.Next())
        {
            ret += tdocs.Doc();
        }
    }
    end = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
    System.Console.Out.WriteLine("milliseconds for " + iter + " TermDocs iteration: " + (end - start));

    return(ret);
}
/// <summary>Returns an enumeration of all the documents which contain
/// <code>term</code>. For each document, the document number, the frequency of
/// the term in that document is also provided, for use in search scoring.
/// Thus, this method implements the mapping:
/// <p><ul>
/// Term => <docNum, freq><sup>*</sup>
/// </ul>
/// <p>The enumeration is ordered by document number. Each document number
/// is greater than all that precede it in the enumeration.
/// </summary>
public virtual TermDocs TermDocs(Term term)
{
    TermDocs termDocs = TermDocs();
    termDocs.Seek(term);
    return(termDocs);
}
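// Usage sketch (not from the surrounding code): the enumeration returned above is consumed
// by stepping through it with Next() and reading the <docNum, freq> pairs. `reader` is
// assumed to be an open IndexReader; "body"/"lucene" are placeholder field/term values,
// and the older method-style accessors used in this example are kept.
TermDocs postings = reader.TermDocs(new Term("body", "lucene"));
try
{
    while (postings.Next())
    {
        int docNum = postings.Doc();   // document numbers come back in ascending order
        int freq = postings.Freq();    // how often the term occurs in that document
        System.Console.Out.WriteLine("doc=" + docNum + " freq=" + freq);
    }
}
finally
{
    postings.Close();
}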
public override DocIdSet GetDocIdSet(IndexReader reader) { TermEnum enumerator = query.GetEnum(reader); try { // if current term in enum is null, the enum is empty -> shortcut if (enumerator.Term == null) { return(DocIdSet.EMPTY_DOCIDSET); } // else fill into an OpenBitSet OpenBitSet bitSet = new OpenBitSet(reader.MaxDoc); int[] docs = new int[32]; int[] freqs = new int[32]; TermDocs termDocs = reader.TermDocs(); try { int termCount = 0; do { Term term = enumerator.Term; if (term == null) { break; } termCount++; termDocs.Seek(term); while (true) { int count = termDocs.Read(docs, freqs); if (count != 0) { for (int i = 0; i < count; i++) { bitSet.Set(docs[i]); } } else { break; } } } while (enumerator.Next()); query.IncTotalNumberOfTerms(termCount); // {{Aroush-2.9}} is the use of 'temp' as is right? } finally { termDocs.Close(); } return(bitSet); } finally { enumerator.Close(); } }
protected internal override System.Object CreateValue(IndexReader reader, Entry entryKey) { Entry entry = entryKey; System.String field = entry.field; FloatParser parser = (FloatParser)entry.custom; if (parser == null) { try { return(wrapper.GetFloats(reader, field, Lucene.Net.Search.FieldCache_Fields.DEFAULT_FLOAT_PARSER)); } catch (System.FormatException) { return(wrapper.GetFloats(reader, field, Lucene.Net.Search.FieldCache_Fields.NUMERIC_UTILS_FLOAT_PARSER)); } } float[] retArray = null; TermDocs termDocs = reader.TermDocs(); TermEnum termEnum = reader.Terms(new Term(field)); try { do { Term term = termEnum.Term; if (term == null || (System.Object)term.Field != (System.Object)field) { break; } float termval = parser.ParseFloat(term.Text); if (retArray == null) { // late init retArray = new float[reader.MaxDoc]; } termDocs.Seek(termEnum); while (termDocs.Next()) { retArray[termDocs.Doc] = termval; } }while (termEnum.Next()); } catch (StopFillCacheException) { } finally { termDocs.Close(); termEnum.Close(); } if (retArray == null) { // no values retArray = new float[reader.MaxDoc]; } return(retArray); }
/// <summary>Returns an enumeration of all the documents which contain
/// <code>term</code>. For each document, the document number, the frequency of
/// the term in that document is also provided, for use in search scoring.
/// Thus, this method implements the mapping:
/// <p><ul>
/// Term => <docNum, freq><sup>*</sup>
/// </ul>
/// <p>The enumeration is ordered by document number. Each document number
/// is greater than all that precede it in the enumeration.
/// </summary>
/// <throws> IOException if there is a low-level IO error </throws>
public virtual TermDocs TermDocs(Term term)
{
    EnsureOpen();
    TermDocs termDocs = TermDocs();
    termDocs.Seek(term);
    return(termDocs);
}
public virtual void TestMultiTermDocs() { SqlServerDirectory.ProvisionDatabase(Connection, "test1", true); SqlServerDirectory.ProvisionDatabase(Connection, "test2", true); SqlServerDirectory.ProvisionDatabase(Connection, "test3", true); var ramDir1 = new SqlServerDirectory(Connection, new Options() { SchemaName = "test1" }); AddDoc(ramDir1, "test foo", true); var ramDir2 = new SqlServerDirectory(Connection, new Options() { SchemaName = "test2" }); AddDoc(ramDir2, "test blah", true); var ramDir3 = new SqlServerDirectory(Connection, new Options() { SchemaName = "test3" }); AddDoc(ramDir3, "test wow", true); IndexReader[] readers1 = new[] { IndexReader.Open(ramDir1, false), IndexReader.Open(ramDir3, false) }; IndexReader[] readers2 = new[] { IndexReader.Open(ramDir1, false), IndexReader.Open(ramDir2, false), IndexReader.Open(ramDir3, false) }; MultiReader mr2 = new MultiReader(readers1); MultiReader mr3 = new MultiReader(readers2); // test mixing up TermDocs and TermEnums from different readers. TermDocs td2 = mr2.TermDocs(); TermEnum te3 = mr3.Terms(new Term("body", "wow")); td2.Seek(te3); int ret = 0; // This should blow up if we forget to check that the TermEnum is from the same // reader as the TermDocs. while (td2.Next()) { ret += td2.Doc; } td2.Close(); te3.Close(); // really a dummy assert to ensure that we got some docs and to ensure that // nothing is optimized out. Assert.IsTrue(ret > 0); }
private TermDocs TermDocs(int i)
{
    if (term == null)
    {
        return(null);
    }
    TermDocs result = readerTermDocs[i];
    if (result == null)
    {
        // Lazily create the per-reader TermDocs the first time this sub-reader is touched.
        result = readerTermDocs[i] = TermDocs(readers[i]);
    }
    result.Seek(term);
    return(result);
}
// TODO (rework): use this function to filter the search results
public int GetMatchWordCount(IEnumerable<SampleDataFileRow> listFoundDocs, string searchTerm)
{
    int totalFreq = 0;
    IndexReader reader = IndexReader.Open(DirectoryFs, true);
    TermDocs termDocs = reader.TermDocs();
    termDocs.Seek(new Term("LineText", searchTerm));
    foreach (SampleDataFileRow singleRow in listFoundDocs)
    {
        termDocs.SkipTo(singleRow.LineNumber);
        totalFreq += termDocs.Freq;
    }
    return(totalFreq);
}
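// Caveat for the method above: SkipTo only moves the enumeration forward, returns false once
// no document at or after the target exists, and may land on a higher document number than
// the one requested. If the rows are not visited in ascending LineNumber order, or a row does
// not actually contain the term, Freq is read from an unrelated posting. A more defensive
// variant (a sketch only; the method name is made up, the field name and row type are kept
// from the example above):
public int GetMatchWordCountChecked(IEnumerable<SampleDataFileRow> listFoundDocs, string searchTerm)
{
    int totalFreq = 0;
    IndexReader reader = IndexReader.Open(DirectoryFs, true);
    TermDocs termDocs = reader.TermDocs();
    termDocs.Seek(new Term("LineText", searchTerm));
    foreach (SampleDataFileRow singleRow in listFoundDocs)   // assumed sorted by LineNumber
    {
        // Only count the frequency when the enumeration actually lands on the requested doc.
        if (termDocs.SkipTo(singleRow.LineNumber) && termDocs.Doc == singleRow.LineNumber)
        {
            totalFreq += termDocs.Freq;
        }
    }
    reader.Close();
    return totalFreq;
}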
protected internal override System.Object CreateValue(IndexReader reader, Entry entryKey) { Entry entry = entryKey; System.String field = entry.field; ShortParser parser = (ShortParser)entry.custom; if (parser == null) { return(wrapper.GetShorts(reader, field, Lucene.Net.Search.FieldCache_Fields.DEFAULT_SHORT_PARSER)); } short[] retArray = new short[reader.MaxDoc]; TermDocs termDocs = reader.TermDocs(); TermEnum termEnum = reader.Terms(new Term(field)); try { do { Term term = termEnum.Term; if (term == null || (System.Object)term.Field != (System.Object)field) { break; } short termval = parser.ParseShort(term.Text); termDocs.Seek(termEnum); while (termDocs.Next()) { retArray[termDocs.Doc] = termval; } }while (termEnum.Next()); } catch (StopFillCacheException) { } finally { termDocs.Close(); termEnum.Close(); } return(retArray); }
/// <summary>
/// Get the DocIdSet.
/// </summary>
/// <param name="reader">Applicable reader.</param>
/// <returns>The set.</returns>
public override DocIdSet GetDocIdSet(IndexReader reader)
{
    OpenBitSet result = new OpenBitSet(reader.MaxDoc);
    TermDocs td = reader.TermDocs();
    try
    {
        // Mark every document that contains at least one of the filter's terms.
        foreach (Term t in this.terms)
        {
            td.Seek(t);
            while (td.Next())
            {
                result.Set(td.Doc);
            }
        }
    }
    finally
    {
        td.Close();
    }
    return(result);
}
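// Usage sketch (not from the surrounding code): a filter like the one above is normally
// handed to the searcher so that scoring is restricted to documents containing at least one
// of the listed terms. `searcher`, `query` and `filter` (an already-populated instance of
// this filter class) are assumed to exist; the result size of 10 is arbitrary, and the
// 3.x-style property names used in this example are kept.
TopDocs top = searcher.Search(query, filter, 10);
foreach (ScoreDoc sd in top.ScoreDocs)
{
    System.Console.Out.WriteLine("doc=" + sd.Doc + " score=" + sd.Score);
}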
public virtual void TestMultiTermDocs() { RAMDirectory ramDir1 = new RAMDirectory(); AddDoc(ramDir1, "test foo", true); RAMDirectory ramDir2 = new RAMDirectory(); AddDoc(ramDir2, "test blah", true); RAMDirectory ramDir3 = new RAMDirectory(); AddDoc(ramDir3, "test wow", true); IndexReader[] readers1 = new IndexReader[] { IndexReader.Open(ramDir1), IndexReader.Open(ramDir3) }; IndexReader[] readers2 = new IndexReader[] { IndexReader.Open(ramDir1), IndexReader.Open(ramDir2), IndexReader.Open(ramDir3) }; MultiReader mr2 = new MultiReader(readers1); MultiReader mr3 = new MultiReader(readers2); // test mixing up TermDocs and TermEnums from different readers. TermDocs td2 = mr2.TermDocs(); TermEnum te3 = mr3.Terms(new Term("body", "wow")); td2.Seek(te3); int ret = 0; // This should blow up if we forget to check that the TermEnum is from the same // reader as the TermDocs. while (td2.Next()) { ret += td2.Doc(); } td2.Close(); te3.Close(); // really a dummy assert to ensure that we got some docs and to ensure that // nothing is optimized out. Assert.IsTrue(ret > 0); }
public void FlushUris()
{
    if (pending_uris == null)
    {
        return;
    }

    TermDocs term_docs = this.searcher.Reader.TermDocs();

    for (int i = 0; i < pending_uris.Count; i++)
    {
        Term term = new Term("Uri", (string)pending_uris [i]);
        term_docs.Seek(term);
        if (term_docs.Next())
        {
            this.Set(term_docs.Doc(), true);
        }
    }

    term_docs.Close();
    pending_uris = null;
}
public override DocIdSet GetDocIdSet(IndexReader reader)
{
    var bits = new OpenBitSet(reader.MaxDoc());
    TermDocs termDocs = reader.TermDocs();
    List<double> area = _shape.Area;
    int sz = area.Count;

    // iterate through each boxid
    for (int i = 0; i < sz; i++)
    {
        double boxId = area[i];
        termDocs.Seek(new Term(_fieldName, NumericUtils.DoubleToPrefixCoded(boxId)));

        // iterate through all documents
        // which have this boxId
        while (termDocs.Next())
        {
            bits.FastSet(termDocs.Doc());
        }
    }

    return(bits);
}
////////////////////////////////////////////////////////////////

static private void ScoreHits(Dictionary<int, Hit> hits_by_id, IndexReader reader, ICollection term_list)
{
    LNS.Similarity similarity;
    similarity = LNS.Similarity.GetDefault();

    TermDocs term_docs = reader.TermDocs();
    Hit hit;

    foreach (Term term in term_list)
    {
        double idf;
        idf = similarity.Idf(reader.DocFreq(term), reader.MaxDoc());

        int hit_count;
        hit_count = hits_by_id.Count;

        term_docs.Seek(term);
        while (term_docs.Next() && hit_count > 0)
        {
            int id;
            id = term_docs.Doc();

            if (hits_by_id.TryGetValue(id, out hit))
            {
                double tf;
                tf = similarity.Tf(term_docs.Freq());
                hit.Score += tf * idf;
                --hit_count;
            }
        }
    }

    term_docs.Close();
}
public virtual void Seek(Term term)
{
    in_Renamed.Seek(term);
}
public virtual void Test1() { ParallelReader pr = new ParallelReader(); pr.Add(ir1); pr.Add(ir2); TermDocs td = pr.TermDocs(); TermEnum te = pr.Terms(); Assert.IsTrue(te.Next()); Assert.AreEqual("field1:brown", te.Term().ToString()); td.Seek(te.Term()); Assert.IsTrue(td.Next()); Assert.AreEqual(0, td.Doc()); Assert.IsFalse(td.Next()); Assert.IsTrue(te.Next()); Assert.AreEqual("field1:fox", te.Term().ToString()); td.Seek(te.Term()); Assert.IsTrue(td.Next()); Assert.AreEqual(0, td.Doc()); Assert.IsFalse(td.Next()); Assert.IsTrue(te.Next()); Assert.AreEqual("field1:jumps", te.Term().ToString()); td.Seek(te.Term()); Assert.IsTrue(td.Next()); Assert.AreEqual(0, td.Doc()); Assert.IsFalse(td.Next()); Assert.IsTrue(te.Next()); Assert.AreEqual("field1:quick", te.Term().ToString()); td.Seek(te.Term()); Assert.IsTrue(td.Next()); Assert.AreEqual(0, td.Doc()); Assert.IsFalse(td.Next()); Assert.IsTrue(te.Next()); Assert.AreEqual("field1:the", te.Term().ToString()); td.Seek(te.Term()); Assert.IsTrue(td.Next()); Assert.AreEqual(0, td.Doc()); Assert.IsFalse(td.Next()); Assert.IsTrue(te.Next()); Assert.AreEqual("field2:brown", te.Term().ToString()); td.Seek(te.Term()); Assert.IsTrue(td.Next()); Assert.AreEqual(0, td.Doc()); Assert.IsFalse(td.Next()); Assert.IsTrue(te.Next()); Assert.AreEqual("field2:fox", te.Term().ToString()); td.Seek(te.Term()); Assert.IsTrue(td.Next()); Assert.AreEqual(0, td.Doc()); Assert.IsFalse(td.Next()); Assert.IsTrue(te.Next()); Assert.AreEqual("field2:jumps", te.Term().ToString()); td.Seek(te.Term()); Assert.IsTrue(td.Next()); Assert.AreEqual(0, td.Doc()); Assert.IsFalse(td.Next()); Assert.IsTrue(te.Next()); Assert.AreEqual("field2:quick", te.Term().ToString()); td.Seek(te.Term()); Assert.IsTrue(td.Next()); Assert.AreEqual(0, td.Doc()); Assert.IsFalse(td.Next()); Assert.IsTrue(te.Next()); Assert.AreEqual("field2:the", te.Term().ToString()); td.Seek(te.Term()); Assert.IsTrue(td.Next()); Assert.AreEqual(0, td.Doc()); Assert.IsFalse(td.Next()); Assert.IsTrue(te.Next()); Assert.AreEqual("field3:dog", te.Term().ToString()); td.Seek(te.Term()); Assert.IsTrue(td.Next()); Assert.AreEqual(0, td.Doc()); Assert.IsFalse(td.Next()); Assert.IsTrue(te.Next()); Assert.AreEqual("field3:fox", te.Term().ToString()); td.Seek(te.Term()); Assert.IsTrue(td.Next()); Assert.AreEqual(0, td.Doc()); Assert.IsFalse(td.Next()); Assert.IsTrue(te.Next()); Assert.AreEqual("field3:jumps", te.Term().ToString()); td.Seek(te.Term()); Assert.IsTrue(td.Next()); Assert.AreEqual(0, td.Doc()); Assert.IsFalse(td.Next()); Assert.IsTrue(te.Next()); Assert.AreEqual("field3:lazy", te.Term().ToString()); td.Seek(te.Term()); Assert.IsTrue(td.Next()); Assert.AreEqual(0, td.Doc()); Assert.IsFalse(td.Next()); Assert.IsTrue(te.Next()); Assert.AreEqual("field3:over", te.Term().ToString()); td.Seek(te.Term()); Assert.IsTrue(td.Next()); Assert.AreEqual(0, td.Doc()); Assert.IsFalse(td.Next()); Assert.IsTrue(te.Next()); Assert.AreEqual("field3:the", te.Term().ToString()); td.Seek(te.Term()); Assert.IsTrue(td.Next()); Assert.AreEqual(0, td.Doc()); Assert.IsFalse(td.Next()); Assert.IsFalse(te.Next()); }
public virtual void TestKnownSetOfDocuments() { System.String[] termArray = new System.String[] { "eating", "chocolate", "in", "a", "computer", "lab", "grows", "old", "colored", "with", "an" }; System.String test1 = "eating chocolate in a computer lab"; //6 terms System.String test2 = "computer in a computer lab"; //5 terms System.String test3 = "a chocolate lab grows old"; //5 terms System.String test4 = "eating chocolate with a chocolate lab in an old chocolate colored computer lab"; //13 terms System.Collections.IDictionary test4Map = new System.Collections.Hashtable(); test4Map["chocolate"] = 3; test4Map["lab"] = 2; test4Map["eating"] = 1; test4Map["computer"] = 1; test4Map["with"] = 1; test4Map["a"] = 1; test4Map["colored"] = 1; test4Map["in"] = 1; test4Map["an"] = 1; test4Map["computer"] = 1; test4Map["old"] = 1; Document testDoc1 = new Document(); SetupDoc(testDoc1, test1); Document testDoc2 = new Document(); SetupDoc(testDoc2, test2); Document testDoc3 = new Document(); SetupDoc(testDoc3, test3); Document testDoc4 = new Document(); SetupDoc(testDoc4, test4); Directory dir = new RAMDirectory(); try { IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true); Assert.IsTrue(writer != null); writer.AddDocument(testDoc1); writer.AddDocument(testDoc2); writer.AddDocument(testDoc3); writer.AddDocument(testDoc4); writer.Close(); IndexSearcher knownSearcher = new IndexSearcher(dir); TermEnum termEnum = knownSearcher.reader.Terms(); TermDocs termDocs = knownSearcher.reader.TermDocs(); //System.out.println("Terms: " + termEnum.size() + " Orig Len: " + termArray.length); Similarity sim = knownSearcher.GetSimilarity(); while (termEnum.Next() == true) { Term term = termEnum.Term(); //System.out.println("Term: " + term); termDocs.Seek(term); while (termDocs.Next()) { int docId = termDocs.Doc(); int freq = termDocs.Freq(); //System.out.println("Doc Id: " + docId + " freq " + freq); TermFreqVector vector = knownSearcher.reader.GetTermFreqVector(docId, "Field"); float tf = sim.Tf(freq); float idf = sim.Idf(term, knownSearcher); //float qNorm = sim.queryNorm() //This is fine since we don't have stop words float lNorm = sim.LengthNorm("Field", vector.GetTerms().Length); //float coord = sim.coord() //System.out.println("TF: " + tf + " IDF: " + idf + " LenNorm: " + lNorm); Assert.IsTrue(vector != null); System.String[] vTerms = vector.GetTerms(); int[] freqs = vector.GetTermFrequencies(); for (int i = 0; i < vTerms.Length; i++) { if (term.Text().Equals(vTerms[i]) == true) { Assert.IsTrue(freqs[i] == freq); } } } //System.out.println("--------"); } Query query = new TermQuery(new Term("Field", "chocolate")); Hits hits = knownSearcher.Search(query); //doc 3 should be the first hit b/c it is the shortest match Assert.IsTrue(hits.Length() == 3); float score = hits.Score(0); /*System.out.println("Hit 0: " + hits.id(0) + " Score: " + hits.score(0) + " String: " + hits.doc(0).toString()); * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(0))); * System.out.println("Hit 1: " + hits.id(1) + " Score: " + hits.score(1) + " String: " + hits.doc(1).toString()); * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(1))); * System.out.println("Hit 2: " + hits.id(2) + " Score: " + hits.score(2) + " String: " + hits.doc(2).toString()); * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(2)));*/ Assert.IsTrue(testDoc3.ToString().Equals(hits.Doc(0).ToString())); Assert.IsTrue(testDoc4.ToString().Equals(hits.Doc(1).ToString())); 
Assert.IsTrue(testDoc1.ToString().Equals(hits.Doc(2).ToString())); TermFreqVector vector2 = knownSearcher.reader.GetTermFreqVector(hits.Id(1), "Field"); Assert.IsTrue(vector2 != null); //System.out.println("Vector: " + vector); System.String[] terms = vector2.GetTerms(); int[] freqs2 = vector2.GetTermFrequencies(); Assert.IsTrue(terms != null && terms.Length == 10); for (int i = 0; i < terms.Length; i++) { System.String term = terms[i]; //System.out.println("Term: " + term); int freq = freqs2[i]; Assert.IsTrue(test4.IndexOf(term) != -1); System.Int32 freqInt = (System.Int32)test4Map[term]; System.Object tmpFreqInt = test4Map[term]; Assert.IsTrue(tmpFreqInt != null); Assert.IsTrue(freqInt == freq); } knownSearcher.Close(); } catch (System.IO.IOException e) { System.Console.Error.WriteLine(e.StackTrace); Assert.IsTrue(false); } }
public virtual void TestKnownSetOfDocuments() { System.String test1 = "eating chocolate in a computer lab"; //6 terms System.String test2 = "computer in a computer lab"; //5 terms System.String test3 = "a chocolate lab grows old"; //5 terms System.String test4 = "eating chocolate with a chocolate lab in an old chocolate colored computer lab"; //13 terms System.Collections.IDictionary test4Map = new System.Collections.Hashtable(); test4Map["chocolate"] = 3; test4Map["lab"] = 2; test4Map["eating"] = 1; test4Map["computer"] = 1; test4Map["with"] = 1; test4Map["a"] = 1; test4Map["colored"] = 1; test4Map["in"] = 1; test4Map["an"] = 1; test4Map["computer"] = 1; test4Map["old"] = 1; Document testDoc1 = new Document(); SetupDoc(testDoc1, test1); Document testDoc2 = new Document(); SetupDoc(testDoc2, test2); Document testDoc3 = new Document(); SetupDoc(testDoc3, test3); Document testDoc4 = new Document(); SetupDoc(testDoc4, test4); Directory dir = new MockRAMDirectory(); try { IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED); Assert.IsTrue(writer != null); writer.AddDocument(testDoc1); writer.AddDocument(testDoc2); writer.AddDocument(testDoc3); writer.AddDocument(testDoc4); writer.Close(); IndexSearcher knownSearcher = new IndexSearcher(dir); TermEnum termEnum = knownSearcher.reader_ForNUnit.Terms(); TermDocs termDocs = knownSearcher.reader_ForNUnit.TermDocs(); //System.out.println("Terms: " + termEnum.size() + " Orig Len: " + termArray.length); Similarity sim = knownSearcher.GetSimilarity(); while (termEnum.Next() == true) { Term term = termEnum.Term(); //System.out.println("Term: " + term); termDocs.Seek(term); while (termDocs.Next()) { int docId = termDocs.Doc(); int freq = termDocs.Freq(); //System.out.println("Doc Id: " + docId + " freq " + freq); TermFreqVector vector = knownSearcher.reader_ForNUnit.GetTermFreqVector(docId, "field"); float tf = sim.Tf(freq); float idf = sim.Idf(term, knownSearcher); //float qNorm = sim.queryNorm() //This is fine since we don't have stop words float lNorm = sim.LengthNorm("field", vector.GetTerms().Length); //float coord = sim.coord() //System.out.println("TF: " + tf + " IDF: " + idf + " LenNorm: " + lNorm); Assert.IsTrue(vector != null); System.String[] vTerms = vector.GetTerms(); int[] freqs = vector.GetTermFrequencies(); for (int i = 0; i < vTerms.Length; i++) { if (term.Text().Equals(vTerms[i])) { Assert.IsTrue(freqs[i] == freq); } } } //System.out.println("--------"); } Query query = new TermQuery(new Term("field", "chocolate")); ScoreDoc[] hits = knownSearcher.Search(query, null, 1000).scoreDocs; //doc 3 should be the first hit b/c it is the shortest match Assert.IsTrue(hits.Length == 3); float score = hits[0].score; /*System.out.println("Hit 0: " + hits.id(0) + " Score: " + hits.score(0) + " String: " + hits.doc(0).toString()); * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(0))); * System.out.println("Hit 1: " + hits.id(1) + " Score: " + hits.score(1) + " String: " + hits.doc(1).toString()); * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(1))); * System.out.println("Hit 2: " + hits.id(2) + " Score: " + hits.score(2) + " String: " + hits.doc(2).toString()); * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(2)));*/ Assert.IsTrue(hits[0].doc == 2); Assert.IsTrue(hits[1].doc == 3); Assert.IsTrue(hits[2].doc == 0); TermFreqVector vector2 = knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].doc, "field"); Assert.IsTrue(vector2 
!= null); //System.out.println("Vector: " + vector); System.String[] terms = vector2.GetTerms(); int[] freqs2 = vector2.GetTermFrequencies(); Assert.IsTrue(terms != null && terms.Length == 10); for (int i = 0; i < terms.Length; i++) { System.String term = terms[i]; //System.out.println("Term: " + term); int freq = freqs2[i]; Assert.IsTrue(test4.IndexOf(term) != -1); System.Int32 freqInt = -1; try { freqInt = (System.Int32)test4Map[term]; } catch (Exception) { Assert.IsTrue(false); } Assert.IsTrue(freqInt == freq); } SortedTermVectorMapper mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator()); knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].doc, mapper); System.Collections.Generic.SortedDictionary <object, object> vectorEntrySet = mapper.GetTermVectorEntrySet(); Assert.IsTrue(vectorEntrySet.Count == 10, "mapper.getTermVectorEntrySet() Size: " + vectorEntrySet.Count + " is not: " + 10); TermVectorEntry last = null; foreach (TermVectorEntry tve in vectorEntrySet.Keys) { if (tve != null && last != null) { Assert.IsTrue(last.GetFrequency() >= tve.GetFrequency(), "terms are not properly sorted"); System.Int32 expectedFreq = (System.Int32)test4Map[tve.GetTerm()]; //we expect double the expectedFreq, since there are two fields with the exact same text and we are collapsing all fields Assert.IsTrue(tve.GetFrequency() == 2 * expectedFreq, "Frequency is not correct:"); } last = tve; } FieldSortedTermVectorMapper fieldMapper = new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator()); knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].doc, fieldMapper); System.Collections.IDictionary map = fieldMapper.GetFieldToTerms(); Assert.IsTrue(map.Count == 2, "map Size: " + map.Count + " is not: " + 2); vectorEntrySet = (System.Collections.Generic.SortedDictionary <Object, Object>)map["field"]; Assert.IsTrue(vectorEntrySet != null, "vectorEntrySet is null and it shouldn't be"); Assert.IsTrue(vectorEntrySet.Count == 10, "vectorEntrySet Size: " + vectorEntrySet.Count + " is not: " + 10); knownSearcher.Close(); } catch (System.IO.IOException e) { System.Console.Error.WriteLine(e.StackTrace); Assert.IsTrue(false); } }
public static void VerifyEquals(IndexReader r1, IndexReader r2, System.String idField) { Assert.AreEqual(r1.NumDocs(), r2.NumDocs()); bool hasDeletes = !(r1.MaxDoc() == r2.MaxDoc() && r1.NumDocs() == r1.MaxDoc()); int[] r2r1 = new int[r2.MaxDoc()]; // r2 id to r1 id mapping TermDocs termDocs1 = r1.TermDocs(); TermDocs termDocs2 = r2.TermDocs(); // create mapping from id2 space to id2 based on idField idField = StringHelper.Intern(idField); TermEnum termEnum = r1.Terms(new Term(idField, "")); do { Term term = termEnum.Term(); if (term == null || (System.Object)term.Field() != (System.Object)idField) { break; } termDocs1.Seek(termEnum); if (!termDocs1.Next()) { // This doc is deleted and wasn't replaced termDocs2.Seek(termEnum); Assert.IsFalse(termDocs2.Next()); continue; } int id1 = termDocs1.Doc(); Assert.IsFalse(termDocs1.Next()); termDocs2.Seek(termEnum); Assert.IsTrue(termDocs2.Next()); int id2 = termDocs2.Doc(); Assert.IsFalse(termDocs2.Next()); r2r1[id2] = id1; // verify stored fields are equivalent try { VerifyEquals(r1.Document(id1), r2.Document(id2)); } catch (System.Exception t) { System.Console.Out.WriteLine("FAILED id=" + term + " id1=" + id1 + " id2=" + id2 + " term=" + term); System.Console.Out.WriteLine(" d1=" + r1.Document(id1)); System.Console.Out.WriteLine(" d2=" + r2.Document(id2)); throw t; } try { // verify term vectors are equivalent VerifyEquals(r1.GetTermFreqVectors(id1), r2.GetTermFreqVectors(id2)); } catch (System.Exception e) { System.Console.Out.WriteLine("FAILED id=" + term + " id1=" + id1 + " id2=" + id2); TermFreqVector[] tv1 = r1.GetTermFreqVectors(id1); System.Console.Out.WriteLine(" d1=" + tv1); if (tv1 != null) { for (int i = 0; i < tv1.Length; i++) { System.Console.Out.WriteLine(" " + i + ": " + tv1[i]); } } TermFreqVector[] tv2 = r2.GetTermFreqVectors(id2); System.Console.Out.WriteLine(" d2=" + tv2); if (tv2 != null) { for (int i = 0; i < tv2.Length; i++) { System.Console.Out.WriteLine(" " + i + ": " + tv2[i]); } } throw e; } }while (termEnum.Next()); termEnum.Close(); // Verify postings TermEnum termEnum1 = r1.Terms(new Term("", "")); TermEnum termEnum2 = r2.Terms(new Term("", "")); // pack both doc and freq into single element for easy sorting long[] info1 = new long[r1.NumDocs()]; long[] info2 = new long[r2.NumDocs()]; for (; ;) { Term term1, term2; // iterate until we get some docs int len1; for (; ;) { len1 = 0; term1 = termEnum1.Term(); if (term1 == null) { break; } termDocs1.Seek(termEnum1); while (termDocs1.Next()) { int d1 = termDocs1.Doc(); int f1 = termDocs1.Freq(); info1[len1] = (((long)d1) << 32) | f1; len1++; } if (len1 > 0) { break; } if (!termEnum1.Next()) { break; } } // iterate until we get some docs int len2; for (; ;) { len2 = 0; term2 = termEnum2.Term(); if (term2 == null) { break; } termDocs2.Seek(termEnum2); while (termDocs2.Next()) { int d2 = termDocs2.Doc(); int f2 = termDocs2.Freq(); info2[len2] = (((long)r2r1[d2]) << 32) | f2; len2++; } if (len2 > 0) { break; } if (!termEnum2.Next()) { break; } } if (!hasDeletes) { Assert.AreEqual(termEnum1.DocFreq(), termEnum2.DocFreq()); } Assert.AreEqual(len1, len2); if (len1 == 0) { break; // no more terms } Assert.AreEqual(term1, term2); // sort info2 to get it into ascending docid System.Array.Sort(info2, 0, len2 - 0); // now compare for (int i = 0; i < len1; i++) { Assert.AreEqual(info1[i], info2[i]); } termEnum1.Next(); termEnum2.Next(); } }
public virtual void testSkipTo(int indexDivisor) { Directory dir = new RAMDirectory(); IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED); Term ta = new Term("content", "aaa"); for (int i = 0; i < 10; i++) { AddDoc(writer, "aaa aaa aaa aaa"); } Term tb = new Term("content", "bbb"); for (int i = 0; i < 16; i++) { AddDoc(writer, "bbb bbb bbb bbb"); } Term tc = new Term("content", "ccc"); for (int i = 0; i < 50; i++) { AddDoc(writer, "ccc ccc ccc ccc"); } // assure that we deal with a single segment writer.Optimize(); writer.Close(); IndexReader reader = IndexReader.Open(dir, null, true, indexDivisor); TermDocs tdocs = reader.TermDocs(); // without optimization (assumption skipInterval == 16) // with next tdocs.Seek(ta); Assert.IsTrue(tdocs.Next()); Assert.AreEqual(0, tdocs.Doc()); Assert.AreEqual(4, tdocs.Freq()); Assert.IsTrue(tdocs.Next()); Assert.AreEqual(1, tdocs.Doc()); Assert.AreEqual(4, tdocs.Freq()); Assert.IsTrue(tdocs.SkipTo(0)); Assert.AreEqual(2, tdocs.Doc()); Assert.IsTrue(tdocs.SkipTo(4)); Assert.AreEqual(4, tdocs.Doc()); Assert.IsTrue(tdocs.SkipTo(9)); Assert.AreEqual(9, tdocs.Doc()); Assert.IsFalse(tdocs.SkipTo(10)); // without next tdocs.Seek(ta); Assert.IsTrue(tdocs.SkipTo(0)); Assert.AreEqual(0, tdocs.Doc()); Assert.IsTrue(tdocs.SkipTo(4)); Assert.AreEqual(4, tdocs.Doc()); Assert.IsTrue(tdocs.SkipTo(9)); Assert.AreEqual(9, tdocs.Doc()); Assert.IsFalse(tdocs.SkipTo(10)); // exactly skipInterval documents and therefore with optimization // with next tdocs.Seek(tb); Assert.IsTrue(tdocs.Next()); Assert.AreEqual(10, tdocs.Doc()); Assert.AreEqual(4, tdocs.Freq()); Assert.IsTrue(tdocs.Next()); Assert.AreEqual(11, tdocs.Doc()); Assert.AreEqual(4, tdocs.Freq()); Assert.IsTrue(tdocs.SkipTo(5)); Assert.AreEqual(12, tdocs.Doc()); Assert.IsTrue(tdocs.SkipTo(15)); Assert.AreEqual(15, tdocs.Doc()); Assert.IsTrue(tdocs.SkipTo(24)); Assert.AreEqual(24, tdocs.Doc()); Assert.IsTrue(tdocs.SkipTo(25)); Assert.AreEqual(25, tdocs.Doc()); Assert.IsFalse(tdocs.SkipTo(26)); // without next tdocs.Seek(tb); Assert.IsTrue(tdocs.SkipTo(5)); Assert.AreEqual(10, tdocs.Doc()); Assert.IsTrue(tdocs.SkipTo(15)); Assert.AreEqual(15, tdocs.Doc()); Assert.IsTrue(tdocs.SkipTo(24)); Assert.AreEqual(24, tdocs.Doc()); Assert.IsTrue(tdocs.SkipTo(25)); Assert.AreEqual(25, tdocs.Doc()); Assert.IsFalse(tdocs.SkipTo(26)); // much more than skipInterval documents and therefore with optimization // with next tdocs.Seek(tc); Assert.IsTrue(tdocs.Next()); Assert.AreEqual(26, tdocs.Doc()); Assert.AreEqual(4, tdocs.Freq()); Assert.IsTrue(tdocs.Next()); Assert.AreEqual(27, tdocs.Doc()); Assert.AreEqual(4, tdocs.Freq()); Assert.IsTrue(tdocs.SkipTo(5)); Assert.AreEqual(28, tdocs.Doc()); Assert.IsTrue(tdocs.SkipTo(40)); Assert.AreEqual(40, tdocs.Doc()); Assert.IsTrue(tdocs.SkipTo(57)); Assert.AreEqual(57, tdocs.Doc()); Assert.IsTrue(tdocs.SkipTo(74)); Assert.AreEqual(74, tdocs.Doc()); Assert.IsTrue(tdocs.SkipTo(75)); Assert.AreEqual(75, tdocs.Doc()); Assert.IsFalse(tdocs.SkipTo(76)); //without next tdocs.Seek(tc); Assert.IsTrue(tdocs.SkipTo(5)); Assert.AreEqual(26, tdocs.Doc()); Assert.IsTrue(tdocs.SkipTo(40)); Assert.AreEqual(40, tdocs.Doc()); Assert.IsTrue(tdocs.SkipTo(57)); Assert.AreEqual(57, tdocs.Doc()); Assert.IsTrue(tdocs.SkipTo(74)); Assert.AreEqual(74, tdocs.Doc()); Assert.IsTrue(tdocs.SkipTo(75)); Assert.AreEqual(75, tdocs.Doc()); Assert.IsFalse(tdocs.SkipTo(76)); tdocs.Close(); reader.Close(); dir.Close(); }
private IndexerReceipt [] Flush_Unlocked(IndexerRequest request) { ArrayList receipt_queue; receipt_queue = new ArrayList(); IndexReader primary_reader, secondary_reader; primary_reader = IndexReader.Open(PrimaryStore); secondary_reader = IndexReader.Open(SecondaryStore); // Step #1: Make our first pass over the list of // indexables that make up our request. For each add // or property change in the request, get the Lucene // documents so we can move forward any persistent // properties (for adds) or all old properties (for // property changes). // // Then, for each add or remove in the request, // delete the associated documents from the index. // Note that we previously cached added documents so // that we can move persistent properties forward. // parent_child_old_props is double-nested hashtable (depth-2 tree) // indexed by the parent uri, it stores another hashtable indexed by the (parent+child documents) // FIXME: 2-level hashtable is a waste for any non-child document. // Replace this by a better data structure. Hashtable parent_child_old_props = UriFu.NewHashtable(); TermDocs term_docs = secondary_reader.TermDocs(); int delete_count = 0; IEnumerable request_indexables = request.Indexables; foreach (Indexable indexable in request_indexables) { string uri_str = UriFu.UriToEscapedString(indexable.Uri); Term term; // Store the necessary properties from old documents for re-addition if (indexable.Type == IndexableType.Add || indexable.Type == IndexableType.PropertyChange) { term = new Term("Uri", uri_str); term_docs.Seek(term); Hashtable this_parent_child_props = null; if (term_docs.Next()) { this_parent_child_props = UriFu.NewHashtable(); this_parent_child_props [indexable.Uri] = secondary_reader.Document(term_docs.Doc()); parent_child_old_props [indexable.Uri] = this_parent_child_props; } term = new Term("ParentUri", uri_str); term_docs.Seek(term); while (term_docs.Next()) { Document doc = secondary_reader.Document(term_docs.Doc()); string child_uri_str = doc.Get("Uri"); Uri child_uri = UriFu.EscapedStringToUri(child_uri_str); // Any valid lucene document *should* have a Uri, so no need to check for null // Store the child documents too, to save persistent-properties // of child documents this_parent_child_props [child_uri] = doc; } } // Now remove (non-remove indexables will be re-added in next block) Logger.Log.Debug("-{0}", indexable.DisplayUri); int num_delete = 0; term = new Term("Uri", uri_str); // For property changes, only secondary index is modified secondary_reader.DeleteDocuments(term); // Now remove from everywhere else (if asked to remove or if asked to add, in which case // we first remove and then add) // So we also need to remove child documents if (indexable.Type != IndexableType.PropertyChange) { num_delete = primary_reader.DeleteDocuments(term); // When we delete an indexable, also delete any children. // FIXME: Shouldn't we also delete any children of children, etc.? term = new Term("ParentUri", uri_str); num_delete += primary_reader.DeleteDocuments(term); secondary_reader.DeleteDocuments(term); } // If this is a strict removal (and not a deletion that // we are doing in anticipation of adding something back), // queue up a removed receipt. 
if (indexable.Type == IndexableType.Remove) { IndexerRemovedReceipt r; r = new IndexerRemovedReceipt(indexable.Id); r.NumRemoved = num_delete; receipt_queue.Add(r); } delete_count += num_delete; } term_docs.Close(); if (HaveItemCount) { AdjustItemCount(-delete_count); } else { SetItemCount(primary_reader); } // We are now done with the readers, so we close them. // And also free them. Somehow not freeing them is preventing them from // GCed at all. primary_reader.Close(); primary_reader = null; secondary_reader.Close(); secondary_reader = null; // FIXME: If we crash at exactly this point, we are in // trouble. Items will have been dropped from the index // without the proper replacements being added. We can // hopefully fix this when we move to Lucene 2.1. // Step #2: Make another pass across our list of indexables // and write out any new documents. if (text_cache != null) { text_cache.BeginTransaction(); } IndexWriter primary_writer, secondary_writer; // FIXME: Lock obtain time-out can happen here; if that happens, // an exception will be thrown and this method will break in the middle // leaving IndexWriters unclosed! Same for any Lucene.Net-index modification // methods. primary_writer = new IndexWriter(PrimaryStore, IndexingAnalyzer, false); secondary_writer = null; foreach (Indexable indexable in request_indexables) { // If shutdown has been started, break here // FIXME: Some more processing will continue, a lot of them // concerning receipts, but the daemon will anyway ignore receipts // now, what is the fastest way to stop from here ? if (Shutdown.ShutdownRequested) { Log.Debug("Shutdown initiated. Breaking while flushing indexables."); break; } // Receipts for removes were generated in the // previous block. Now we just have to remove // items from the text cache. if (indexable.Type == IndexableType.Remove) { if (text_cache != null) { text_cache.Delete(indexable.Uri); } continue; } IndexerAddedReceipt r; Hashtable prop_change_docs = (Hashtable)parent_child_old_props [indexable.Uri]; if (indexable.Type == IndexableType.PropertyChange) { Logger.Log.Debug("+{0} (props only)", indexable.DisplayUri); r = new IndexerAddedReceipt(indexable.Id); r.PropertyChangesOnly = true; receipt_queue.Add(r); Document doc; if (prop_change_docs == null) { doc = null; } else { doc = (Document)prop_change_docs [indexable.Uri]; } Document new_doc; new_doc = RewriteDocument(doc, indexable); // Write out the new document... if (secondary_writer == null) { secondary_writer = new IndexWriter(SecondaryStore, IndexingAnalyzer, false); } secondary_writer.AddDocument(new_doc); // Get child property change indexables... 
ArrayList prop_change_indexables; prop_change_indexables = GetChildPropertyChange(prop_change_docs, indexable); // and store them; no need to delete them first, since they were already removed from the index if (prop_change_indexables == null) { continue; } foreach (Indexable prop_change_indexable in prop_change_indexables) { Log.Debug("+{0} (props only, generated indexable)", prop_change_indexable.Uri); doc = (Document)prop_change_docs [prop_change_indexable.Uri]; new_doc = RewriteDocument(doc, prop_change_indexable); secondary_writer.AddDocument(new_doc); } continue; // ...and proceed to the next Indexable } // If we reach this point we know we are dealing with an IndexableType.Add if (indexable.Type != IndexableType.Add) { throw new Exception("When I said it was an IndexableType.Add, I meant it!"); } r = AddIndexableToIndex(indexable, primary_writer, ref secondary_writer, prop_change_docs); if (r != null) { receipt_queue.Add(r); } } if (text_cache != null) { text_cache.CommitTransaction(); } if (Shutdown.ShutdownRequested) { foreach (DeferredInfo di in deferred_indexables) { di.Cleanup(); } deferred_indexables.Clear(); foreach (Indexable indexable in request_indexables) { indexable.Cleanup(); } primary_writer.Close(); if (secondary_writer != null) { secondary_writer.Close(); } return(null); } if (request.OptimizeIndex) { Stopwatch watch = new Stopwatch(); Logger.Log.Debug("Optimizing {0}", IndexName); watch.Start(); primary_writer.Optimize(); if (secondary_writer == null) { secondary_writer = new IndexWriter(SecondaryStore, IndexingAnalyzer, false); } secondary_writer.Optimize(); watch.Stop(); Logger.Log.Debug("{0} optimized in {1}", IndexName, watch); } // Step #4. Close our writers and return the events to // indicate what has happened. primary_writer.Close(); if (secondary_writer != null) { secondary_writer.Close(); } // Send a single IndexerIndexablesReceipt if there were deferred indexables if (deferred_indexables.Count > 0) { Log.Debug("{0} indexables generated more indexables; asking daemon to schedule their indexing.", deferred_indexables.Count); IndexerIndexablesReceipt r = new IndexerIndexablesReceipt(); receipt_queue.Add(r); } IndexerReceipt [] receipt_array; receipt_array = new IndexerReceipt [receipt_queue.Count]; for (int i = 0; i < receipt_queue.Count; ++i) { receipt_array [i] = (IndexerReceipt)receipt_queue [i]; } return(receipt_array); }
public override void Load(string fieldName, IndexReader reader, TermListFactory listFactory, BoboIndexReader.WorkArea workArea) { long t0 = System.Environment.TickCount; int maxdoc = reader.MaxDoc; BigNestedIntArray.BufferedLoader loader = GetBufferedLoader(maxdoc, workArea); BigNestedIntArray.BufferedLoader weightLoader = GetBufferedLoader(maxdoc, null); TermEnum tenum = null; TermDocs tdoc = null; var list = (listFactory == null ? new TermStringList() : listFactory.CreateTermList()); List <int> minIDList = new List <int>(); List <int> maxIDList = new List <int>(); List <int> freqList = new List <int>(); OpenBitSet bitset = new OpenBitSet(maxdoc + 1); int negativeValueCount = GetNegativeValueCount(reader, string.Intern(fieldName)); int t = 0; // current term number list.Add(null); minIDList.Add(-1); maxIDList.Add(-1); freqList.Add(0); t++; _overflow = false; string pre = null; int df = 0; int minID = -1; int maxID = -1; int valId = 0; try { tdoc = reader.TermDocs(); tenum = reader.Terms(new Term(fieldName, "")); if (tenum != null) { do { Term term = tenum.Term; if (term == null || !fieldName.Equals(term.Field)) { break; } string val = term.Text; if (val != null) { int weight = 0; string[] split = val.Split(new char[] { '\0' }, StringSplitOptions.RemoveEmptyEntries); if (split.Length > 1) { val = split[0]; weight = int.Parse(split[split.Length - 1]); } if (pre == null || !val.Equals(pre)) { if (pre != null) { freqList.Add(df); minIDList.Add(minID); maxIDList.Add(maxID); } list.Add(val); df = 0; minID = -1; maxID = -1; valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t; t++; } tdoc.Seek(tenum); if (tdoc.Next()) { df++; int docid = tdoc.Doc; if (!loader.Add(docid, valId)) { LogOverflow(fieldName); } else { weightLoader.Add(docid, weight); } if (docid < minID) { minID = docid; } bitset.FastSet(docid); while (tdoc.Next()) { df++; docid = tdoc.Doc; if (!loader.Add(docid, valId)) { LogOverflow(fieldName); } else { weightLoader.Add(docid, weight); } bitset.FastSet(docid); } if (docid > maxID) { maxID = docid; } } pre = val; } }while (tenum.Next()); if (pre != null) { freqList.Add(df); minIDList.Add(minID); maxIDList.Add(maxID); } } } finally { try { if (tdoc != null) { tdoc.Dispose(); } } finally { if (tenum != null) { tenum.Dispose(); } } } list.Seal(); try { _nestedArray.Load(maxdoc + 1, loader); _weightArray.Load(maxdoc + 1, weightLoader); } catch (System.IO.IOException e) { throw e; } catch (Exception e) { throw new RuntimeException("failed to load due to " + e.ToString(), e); } this.valArray = list; this.freqs = freqList.ToArray(); this.minIDs = minIDList.ToArray(); this.maxIDs = maxIDList.ToArray(); int doc = 0; while (doc <= maxdoc && !_nestedArray.Contains(doc, 0, true)) { ++doc; } if (doc <= maxdoc) { this.minIDs[0] = doc; doc = maxdoc; while (doc > 0 && !_nestedArray.Contains(doc, 0, true)) { --doc; } if (doc > 0) { this.maxIDs[0] = doc; } } this.freqs[0] = maxdoc + 1 - (int)bitset.Cardinality(); }
/// <summary> /// loads multi-value facet data. This method uses a workarea to prepare loading. /// </summary> /// <param name="fieldName"></param> /// <param name="reader"></param> /// <param name="listFactory"></param> /// <param name="workArea"></param> public virtual void Load(string fieldName, IndexReader reader, TermListFactory listFactory, BoboIndexReader.WorkArea workArea) { long t0 = Environment.TickCount; int maxdoc = reader.MaxDoc; BigNestedIntArray.BufferedLoader loader = GetBufferedLoader(maxdoc, workArea); TermEnum tenum = null; TermDocs tdoc = null; ITermValueList list = (listFactory == null ? (ITermValueList) new TermStringList() : listFactory.CreateTermList()); List <int> minIDList = new List <int>(); List <int> maxIDList = new List <int>(); List <int> freqList = new List <int>(); OpenBitSet bitset = new OpenBitSet(); int negativeValueCount = GetNegativeValueCount(reader, string.Intern(fieldName)); int t = 0; // current term number list.Add(null); minIDList.Add(-1); maxIDList.Add(-1); freqList.Add(0); t++; _overflow = false; try { tdoc = reader.TermDocs(); tenum = reader.Terms(new Term(fieldName, "")); if (tenum != null) { do { Term term = tenum.Term; if (term == null || !fieldName.Equals(term.Field)) { break; } string val = term.Text; if (val != null) { list.Add(val); tdoc.Seek(tenum); //freqList.add(tenum.docFreq()); // removed because the df doesn't take into account the num of deletedDocs int df = 0; int minID = -1; int maxID = -1; int valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t; if (tdoc.Next()) { df++; int docid = tdoc.Doc; if (!loader.Add(docid, valId)) { LogOverflow(fieldName); } minID = docid; bitset.Set(docid); while (tdoc.Next()) { df++; docid = tdoc.Doc; if (!loader.Add(docid, valId)) { LogOverflow(fieldName); } bitset.Set(docid); } maxID = docid; } freqList.Add(df); minIDList.Add(minID); maxIDList.Add(maxID); } t++; }while (tenum.Next()); } } finally { try { if (tdoc != null) { tdoc.Dispose(); } } finally { if (tenum != null) { tenum.Dispose(); } } } list.Seal(); try { _nestedArray.Load(maxdoc + 1, loader); } catch (System.IO.IOException e) { throw e; } catch (Exception e) { throw new RuntimeException("failed to load due to " + e.ToString(), e); } this.valArray = list; this.freqs = freqList.ToArray(); this.minIDs = minIDList.ToArray(); this.maxIDs = maxIDList.ToArray(); int doc = 0; while (doc <= maxdoc && !_nestedArray.Contains(doc, 0, true)) { ++doc; } if (doc <= maxdoc) { this.minIDs[0] = doc; doc = maxdoc; while (doc > 0 && !_nestedArray.Contains(doc, 0, true)) { --doc; } if (doc > 0) { this.maxIDs[0] = doc; } } this.freqs[0] = maxdoc + 1 - (int)bitset.Cardinality(); }
public virtual void Load(string fieldName, IndexReader reader, TermListFactory listFactory) { string field = string.Intern(fieldName); int maxDoc = reader.MaxDoc; BigSegmentedArray order = this.orderArray; if (order == null) // we want to reuse the memory { int dictValueCount = GetDictValueCount(reader, fieldName); order = NewInstance(dictValueCount, maxDoc); } else { order.EnsureCapacity(maxDoc); // no need to fill to 0, we are reseting the // data anyway } this.orderArray = order; List <int> minIDList = new List <int>(); List <int> maxIDList = new List <int>(); List <int> freqList = new List <int>(); int length = maxDoc + 1; ITermValueList list = listFactory == null ? (ITermValueList) new TermStringList() : listFactory.CreateTermList(); int negativeValueCount = GetNegativeValueCount(reader, field); TermDocs termDocs = reader.TermDocs(); TermEnum termEnum = reader.Terms(new Term(field, "")); int t = 0; // current term number list.Add(null); minIDList.Add(-1); maxIDList.Add(-1); freqList.Add(0); int totalFreq = 0; //int df = 0; t++; try { do { Term term = termEnum.Term; if (term == null || string.CompareOrdinal(term.Field, field) != 0) { break; } // store term text // we expect that there is at most one term per document // Alexey: well, we could get now more than one term per document. Effectively, we could build facet against tokenized field //if (t >= length) //{ // throw new RuntimeException("there are more terms than " + "documents in field \"" + field // + "\", but it's impossible to sort on " + "tokenized fields"); //} list.Add(term.Text); termDocs.Seek(termEnum); // freqList.add(termEnum.docFreq()); // doesn't take into account deldocs int minID = -1; int maxID = -1; int df = 0; int valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t; if (termDocs.Next()) { df++; int docid = termDocs.Doc; order.Add(docid, valId); minID = docid; while (termDocs.Next()) { df++; docid = termDocs.Doc; order.Add(docid, valId); } maxID = docid; } freqList.Add(df); totalFreq += df; minIDList.Add(minID); maxIDList.Add(maxID); t++; } while (termEnum.Next()); } finally { termDocs.Dispose(); termEnum.Dispose(); } list.Seal(); this.valArray = list; this.freqs = freqList.ToArray(); this.minIDs = minIDList.ToArray(); this.maxIDs = maxIDList.ToArray(); int doc = 0; while (doc <= maxDoc && order.Get(doc) != 0) { ++doc; } if (doc <= maxDoc) { this.minIDs[0] = doc; // Try to get the max doc = maxDoc; while (doc > 0 && order.Get(doc) != 0) { --doc; } if (doc > 0) { this.maxIDs[0] = doc; } } this.freqs[0] = maxDoc + 1 - totalFreq; }