/// <summary>
/// Finds the top <paramref name="num"/> terms, ranked by document frequency.
/// </summary>
/// <param name="num">Number of terms to return.</param>
/// <returns>Terms ordered by descending document frequency.</returns>
public TermModel[] FindTerms(int num)
{
    num++;
    TermInfoQueue queue = new TermInfoQueue(num);
    TermEnum enum2 = open.Reader.Terms();
    int count = 0;
    while (enum2.Next())
    {
        string str = enum2.Term().Field();
        if ((currentField != null) && (!str.Equals(currentField)))
        {
            continue;
        }
        if (enum2.DocFreq() > count)
        {
            queue.Put(new TermModel(enum2.Term(), enum2.DocFreq()));
            if (queue.Size() < num)
            {
                continue;
            }
            queue.Pop();
            count = ((TermModel)queue.Top()).Count;
        }
    }
    enum2.Close();

    TermModel[] modelArray = new TermModel[queue.Size()];
    for (int i = 0; i < modelArray.Length; i++)
    {
        modelArray[(modelArray.Length - i) - 1] = (TermModel)queue.Pop();
    }
    return modelArray;
}
public override BitArray Bits(IndexReader reader)
{
    // reader.GetVersion could be used to cache
    // Debug.WriteLine(reader.GetVersion()); // could be used to cache
    // if (cached reader == reader && _revFirst ==

    if (_revFirst == All || _revLast == All) // optimization
    {
        return new BitArray(reader.MaxDoc(), true);
    }

    BitArray last_bits = new BitArray(reader.MaxDoc(), false);
    TermEnum t = reader.Terms(new Term(FieldName.RevisionLast, _revFirst.ToString(RevFormat)));
    TermDocs d = reader.TermDocs();

    //if (t.SkipTo((new Term(FieldName.RevisionLast, revision.ToString(RevFormat))))) // extremely slow
    if (t.Term() != null)
    {
        while (t.Term().Field() == FieldName.RevisionLast)
        {
            d.Seek(t);
            while (d.Next())
            {
                last_bits[d.Doc()] = true;
            }
            if (!t.Next())
            {
                break;
            }
        }
    }

    // optimization: skip the second pass if we are just using the head revision
    if (_revLast == Head)
    {
        return last_bits;
    }

    BitArray first_bits = new BitArray(reader.MaxDoc(), true);
    t = reader.Terms(new Term("rev_first", (_revLast + 1).ToString(RevFormat)));
    //if (t.SkipTo((new Term("rev_first", (revision + 1).ToString(RevFormat))))) // extremely slow
    if (t.Term() != null)
    {
        while (t.Term().Field() == "rev_first")
        {
            d.Seek(t);
            while (d.Next())
            {
                first_bits[d.Doc()] = false;
            }
            if (!t.Next())
            {
                break;
            }
        }
    }

    return last_bits.And(first_bits);
}
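A minimal usage sketch, assuming the Bits() override above lives in a Filter subclass (here called RevisionRangeFilter; the constructor, variables, and field values are illustrative, not from the source): a document survives only if its [rev_first, rev_last] interval overlaps the requested revision range, which is exactly the AND of last_bits and first_bits computed above.

    // directory is an already-opened Lucene Directory; firstRev/lastRev are example values.
    int firstRev = 100, lastRev = 250;
    Filter revisionFilter = new RevisionRangeFilter(firstRev, lastRev); // hypothetical filter class
    IndexSearcher searcher = new IndexSearcher(directory);

    // Search restricted to documents visible somewhere in the requested revision range.
    TopDocs top = searcher.Search(new TermQuery(new Term("path", "/trunk/readme.txt")),
                                  revisionFilter, 25);
    searcher.Close();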
static void CheckIsHeadOnly(IndexSearcher searcher)
{
    TermEnum t = searcher.Reader.Terms(new Term(FieldName.RevisionLast, "0"));
    Assert.IsNotNull(t);
    Assert.AreEqual(FieldName.RevisionLast, t.Term().Field());
    while (t.Term().Field() == FieldName.RevisionLast)
    {
        Assert.AreEqual(Revision.HeadString, t.Term().Text());
        if (!t.Next()) // stop at the end of the enumeration; otherwise the loop never terminates
        {
            break;
        }
    }
}
private int[] docMap; // use getDocMap()

internal SegmentMergeInfo(int b, TermEnum te, IndexReader r)
{
    base_Renamed = b;
    reader = r;
    termEnum = te;
    term = te.Term();
}
public IEnumerable<TermInfo> GetTerms()
{
    var directory = _openIndexModel.Directory;
    IndexReader indexReader = null;
    TermEnum terms = null;
    try
    {
        indexReader = IndexReader.Open(directory, true); // ToDo should i open this only once
        terms = indexReader.Terms();
        while (terms.Next())
        {
            System.Threading.Thread.Sleep(2);
            var term = terms.Term();
            yield return new TermInfo
            {
                Term = term.Text(),
                Field = term.Field(),
                Frequency = terms.DocFreq()
            };
        }
    }
    finally
    {
        if (indexReader != null)
        {
            indexReader.Close();
        }
        if (terms != null)
        {
            terms.Close();
        }
    }
    yield break;
}
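A brief consumption sketch (the termBrowser instance and its hosting class are assumptions; only GetTerms() above is given). Because the method is a lazy iterator, the IndexReader is opened when enumeration starts and the finally block closes it when the foreach completes or the enumerator is disposed.

    foreach (var info in termBrowser.GetTerms()) // termBrowser: hypothetical owner of GetTerms()
    {
        Console.WriteLine("{0}:{1} ({2} docs)", info.Field, info.Term, info.Frequency);
    }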
/// <summary>
/// Finds a term in the specified field.
/// </summary>
/// <param name="field">Field name to look in.</param>
/// <param name="text">Term text to skip to.</param>
/// <param name="current">If true, return the term at the skipped-to position; otherwise advance to the next term in the same field.</param>
/// <returns>The matching term model, or null if none was found.</returns>
public TermModel FindTerm(string field, string text, bool current)
{
    TermEnum enum2 = open.Reader.Terms();
    if (enum2.SkipTo(new Term(field, text)))
    {
        TermModel model = null;
        while ((!current && enum2.Next() && field.Equals(enum2.Term().Field())) || current)
        {
            model = new TermModel(enum2.Term(), enum2.DocFreq());
            break;
        }
        enum2.Close();
        return model;
    }
    enum2.Close(); // close the enumerator even when SkipTo finds nothing
    return null;
}
private OpenBitSet FastBits(IndexReader reader)
{
    OpenBitSet bits = new OpenBitSet(reader.MaxDoc());
    bits.Set(0, reader.MaxDoc()); // assume all are valid
    Term startTerm = new Term(fieldName);
    TermEnum te = reader.Terms(startTerm);
    if (te != null)
    {
        Term currTerm = te.Term();
        while ((currTerm != null) && (currTerm.Field() == startTerm.Field())) // term fieldnames are interned
        {
            if (te.DocFreq() > 1)
            {
                // unset potential duplicates
                int lastDoc = -1;
                TermDocs td = reader.TermDocs(currTerm);
                td.Next();
                if (keepMode == KM_USE_FIRST_OCCURRENCE)
                {
                    td.Next();
                }
                do
                {
                    lastDoc = td.Doc();
                    bits.Clear(lastDoc);
                } while (td.Next());
                if (keepMode == KM_USE_LAST_OCCURRENCE)
                {
                    // restore the last bit
                    bits.Set(lastDoc);
                }
            }
            if (!te.Next())
            {
                break;
            }
            currTerm = te.Term();
        }
    }
    return bits;
}
public void DisplayInternalIndex()
{
    Directory mainIndexDir = SearchFactory.GetSearchFactory(sessions).GetDirectoryProvider(typeof(Book)).Directory;
    IndexReader reader = IndexReader.Open(mainIndexDir);
    TermEnum terms = reader.Terms();
    while (terms.Next())
    {
        Term term = terms.Term();
        log.Debug("In " + term.Field() + ": " + term.Text());
    }
}
public int Get(string path)
{
    int revision;
    lock (_highest)
    {
        if (_highest.TryGetValue(path, out revision))
        {
            return revision;
        }
    }
    if (Reader == null)
    {
        return 0;
    }

    path += "@";
    TermEnum t = Reader.Terms(new Term(FieldName.Id, path));
    int doc = -1;
    while (t.Term() != null && t.Term().Text().StartsWith(path))
    {
        int r = int.Parse(t.Term().Text().Substring(path.Length));
        if (r > revision)
        {
            revision = r;
            TermDocs d = Reader.TermDocs(t.Term());
            d.Next();
            doc = d.Doc();
        }
        t.Next();
    }
    t.Close();

    if (revision != 0 && Reader.Document(doc).Get(FieldName.RevisionLast) != Revision.HeadString)
    {
        return 0;
    }
    return revision;
}
internal bool Next()
{
    if (termEnum.Next())
    {
        term = termEnum.Term();
        return true;
    }
    else
    {
        term = null;
        return false;
    }
}
private OpenBitSet CorrectBits(IndexReader reader)
{
    OpenBitSet bits = new OpenBitSet(reader.MaxDoc()); // assume all are INvalid
    Term startTerm = new Term(fieldName);
    TermEnum te = reader.Terms(startTerm);
    if (te != null)
    {
        Term currTerm = te.Term();
        while ((currTerm != null) && (currTerm.Field() == startTerm.Field())) // term fieldnames are interned
        {
            // set non duplicates
            int lastDoc = -1;
            TermDocs td = reader.TermDocs(currTerm);
            if (td.Next())
            {
                if (keepMode == KM_USE_FIRST_OCCURRENCE)
                {
                    bits.Set(td.Doc());
                }
                else
                {
                    do
                    {
                        lastDoc = td.Doc();
                    } while (td.Next());
                    bits.Set(lastDoc);
                }
            }
            if (!te.Next())
            {
                break;
            }
            currTerm = te.Term();
        }
    }
    return bits;
}
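CorrectBits and FastBits above look like the two validation strategies of a duplicate filter: CorrectBits starts from an empty set and marks only the one document to keep per term (always correct), while FastBits starts with every bit set and clears the extra occurrences, which is cheaper but only equivalent when every document carries exactly one value in fieldName. A hedged sketch of how the choice is typically wired up (the dispatch method name and mode constants below are assumptions, modelled on Lucene's contrib DuplicateFilter):

    public OpenBitSet GetBits(IndexReader reader) // hypothetical dispatch method
    {
        // FastBits: cheap, assumes one value of fieldName per document.
        // CorrectBits: always correct, but walks the postings of every term.
        return processingMode == PM_FAST_INVALIDATION
            ? FastBits(reader)
            : CorrectBits(reader);
    }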
private IEnumerable<string> GetFieldValues(IndexReader reader, string groupByField)
{
    TermEnum te = reader.Terms(new Term(groupByField, string.Empty));
    if (te.Term() == null || te.Term().Field() != groupByField)
    {
        return Enumerable.Empty<string>();
    }

    var list = new List<string>();
    list.Add(te.Term().Text());
    while (te.Next())
    {
        if (te.Term().Field() != groupByField)
        {
            break;
        }
        list.Add(te.Term().Text());
    }
    return list;
}
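A short usage sketch, called from inside the same class (the directory variable and field name are illustrative): because reader.Terms(new Term(field, string.Empty)) positions the enumerator at the first term of that field, the method yields every distinct value indexed in groupByField, in term order, which is what grouping code needs.

    IndexReader reader = IndexReader.Open(directory, true);
    foreach (string value in GetFieldValues(reader, "category")) // "category" is an example field
    {
        Console.WriteLine(value);
    }
    reader.Close();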
private void buttonFirstTerm_Click(object sender, System.EventArgs e)
{
    if (_luke.IndexReader == null)
    {
        _luke.ShowStatus(_luke.resources.GetString("NoIndex"));
        return;
    }
    try
    {
        TermEnum te = _luke.IndexReader.Terms();
        te.Next();
        Term t = te.Term();
        _ShowTerm(t);
    }
    catch (Exception exc)
    {
        _luke.ShowStatus(exc.Message);
    }
}
public virtual void TestKnownSetOfDocuments()
{
    System.String[] termArray = new System.String[] { "eating", "chocolate", "in", "a", "computer", "lab", "grows", "old", "colored", "with", "an" };
    System.String test1 = "eating chocolate in a computer lab"; // 6 terms
    System.String test2 = "computer in a computer lab"; // 5 terms
    System.String test3 = "a chocolate lab grows old"; // 5 terms
    System.String test4 = "eating chocolate with a chocolate lab in an old chocolate colored computer lab"; // 13 terms
    System.Collections.IDictionary test4Map = new System.Collections.Hashtable();
    test4Map["chocolate"] = 3;
    test4Map["lab"] = 2;
    test4Map["eating"] = 1;
    test4Map["computer"] = 1;
    test4Map["with"] = 1;
    test4Map["a"] = 1;
    test4Map["colored"] = 1;
    test4Map["in"] = 1;
    test4Map["an"] = 1;
    test4Map["computer"] = 1;
    test4Map["old"] = 1;

    Document testDoc1 = new Document();
    SetupDoc(testDoc1, test1);
    Document testDoc2 = new Document();
    SetupDoc(testDoc2, test2);
    Document testDoc3 = new Document();
    SetupDoc(testDoc3, test3);
    Document testDoc4 = new Document();
    SetupDoc(testDoc4, test4);

    Directory dir = new RAMDirectory();

    try
    {
        IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true);
        Assert.IsTrue(writer != null);
        writer.AddDocument(testDoc1);
        writer.AddDocument(testDoc2);
        writer.AddDocument(testDoc3);
        writer.AddDocument(testDoc4);
        writer.Close();

        IndexSearcher knownSearcher = new IndexSearcher(dir);
        TermEnum termEnum = knownSearcher.reader.Terms();
        TermDocs termDocs = knownSearcher.reader.TermDocs();
        //System.out.println("Terms: " + termEnum.size() + " Orig Len: " + termArray.length);

        Similarity sim = knownSearcher.GetSimilarity();
        while (termEnum.Next() == true)
        {
            Term term = termEnum.Term();
            //System.out.println("Term: " + term);
            termDocs.Seek(term);
            while (termDocs.Next())
            {
                int docId = termDocs.Doc();
                int freq = termDocs.Freq();
                //System.out.println("Doc Id: " + docId + " freq " + freq);
                TermFreqVector vector = knownSearcher.reader.GetTermFreqVector(docId, "Field");
                float tf = sim.Tf(freq);
                float idf = sim.Idf(term, knownSearcher);
                //float qNorm = sim.queryNorm()
                //This is fine since we don't have stop words
                float lNorm = sim.LengthNorm("Field", vector.GetTerms().Length);
                //float coord = sim.coord()
                //System.out.println("TF: " + tf + " IDF: " + idf + " LenNorm: " + lNorm);
                Assert.IsTrue(vector != null);
                System.String[] vTerms = vector.GetTerms();
                int[] freqs = vector.GetTermFrequencies();
                for (int i = 0; i < vTerms.Length; i++)
                {
                    if (term.Text().Equals(vTerms[i]) == true)
                    {
                        Assert.IsTrue(freqs[i] == freq);
                    }
                }
            }
            //System.out.println("--------");
        }

        Query query = new TermQuery(new Term("Field", "chocolate"));
        Hits hits = knownSearcher.Search(query);
        //doc 3 should be the first hit b/c it is the shortest match
        Assert.IsTrue(hits.Length() == 3);
        float score = hits.Score(0);
        /*System.out.println("Hit 0: " + hits.id(0) + " Score: " + hits.score(0) + " String: " + hits.doc(0).toString());
         * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(0)));
         * System.out.println("Hit 1: " + hits.id(1) + " Score: " + hits.score(1) + " String: " + hits.doc(1).toString());
         * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(1)));
         * System.out.println("Hit 2: " + hits.id(2) + " Score: " + hits.score(2) + " String: " + hits.doc(2).toString());
         * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(2)));*/
        Assert.IsTrue(testDoc3.ToString().Equals(hits.Doc(0).ToString()));
        Assert.IsTrue(testDoc4.ToString().Equals(hits.Doc(1).ToString()));
        Assert.IsTrue(testDoc1.ToString().Equals(hits.Doc(2).ToString()));

        TermFreqVector vector2 = knownSearcher.reader.GetTermFreqVector(hits.Id(1), "Field");
        Assert.IsTrue(vector2 != null);
        //System.out.println("Vector: " + vector);
        System.String[] terms = vector2.GetTerms();
        int[] freqs2 = vector2.GetTermFrequencies();
        Assert.IsTrue(terms != null && terms.Length == 10);
        for (int i = 0; i < terms.Length; i++)
        {
            System.String term = terms[i];
            //System.out.println("Term: " + term);
            int freq = freqs2[i];
            Assert.IsTrue(test4.IndexOf(term) != -1);
            System.Int32 freqInt = (System.Int32)test4Map[term];
            System.Object tmpFreqInt = test4Map[term];
            Assert.IsTrue(tmpFreqInt != null);
            Assert.IsTrue(freqInt == freq);
        }

        knownSearcher.Close();
    }
    catch (System.IO.IOException e)
    {
        System.Console.Error.WriteLine(e.StackTrace);
        Assert.IsTrue(false);
    }
}
private void AddTerms(IndexReader reader, FieldVals f)
{
    if (f.queryString == null)
    {
        return;
    }
    TokenStream ts = analyzer.TokenStream(f.fieldName, new System.IO.StringReader(f.queryString));
    TermAttribute termAtt = (TermAttribute)ts.AddAttribute(typeof(TermAttribute));

    int corpusNumDocs = reader.NumDocs();
    Term internSavingTemplateTerm = new Term(f.fieldName); // optimization to avoid constructing new Term() objects
    Hashtable processedTerms = new Hashtable();

    while (ts.IncrementToken())
    {
        String term = termAtt.Term();
        if (!processedTerms.Contains(term))
        {
            processedTerms.Add(term, term);
            ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); // maxNum variants considered for any one term
            float minScore = 0;
            Term startTerm = internSavingTemplateTerm.CreateTerm(term);
            FuzzyTermEnum fe = new FuzzyTermEnum(reader, startTerm, f.minSimilarity, f.prefixLength);
            TermEnum origEnum = reader.Terms(startTerm);
            int df = 0;
            if (startTerm.Equals(origEnum.Term()))
            {
                df = origEnum.DocFreq(); // store the df so all variants use same idf
            }
            int numVariants = 0;
            int totalVariantDocFreqs = 0;
            do
            {
                Term possibleMatch = fe.Term();
                if (possibleMatch != null)
                {
                    numVariants++;
                    totalVariantDocFreqs += fe.DocFreq();
                    float score = fe.Difference();
                    if (variantsQ.Size() < MAX_VARIANTS_PER_TERM || score > minScore)
                    {
                        ScoreTerm st = new ScoreTerm(possibleMatch, score, startTerm);
                        variantsQ.Insert(st);
                        minScore = ((ScoreTerm)variantsQ.Top()).score; // maintain minScore
                    }
                }
            } while (fe.Next());

            if (numVariants > 0)
            {
                int avgDf = totalVariantDocFreqs / numVariants;
                if (df == 0) // no direct match we can use as df for all variants
                {
                    df = avgDf; // use avg df of all variants
                }

                // take the top variants (scored by edit distance) and reset the score
                // to include an IDF factor then add to the global queue for ranking
                // overall top query terms
                int size = variantsQ.Size();
                for (int i = 0; i < size; i++)
                {
                    ScoreTerm st = (ScoreTerm)variantsQ.Pop();
                    st.score = (st.score * st.score) * sim.Idf(df, corpusNumDocs);
                    q.Insert(st);
                }
            }
        }
    }
}
void BeginAsyncReconstruction(int docNum, Document document, Hashtable doc)
{
    // get stored fields
    ArrayList sf = new ArrayList();
    for (int i = 0; i < _indexFields.Length; i++)
    {
        Field[] f = document.GetFields(_indexFields[i]);
        if (f == null || f.Length == 0 || !f[0].IsStored())
        {
            continue;
        }
        StringBuilder sb = new StringBuilder();
        for (int k = 0; k < f.Length; k++)
        {
            if (k > 0)
            {
                sb.Append('\n');
            }
            sb.Append(f[k].StringValue());
        }
        Field field = Legacy.CreateField(_indexFields[i], sb.ToString(), f[0].IsStored(), f[0].IsIndexed(), f[0].IsTokenized(), f[0].IsTermVectorStored());
        field.SetBoost(f[0].GetBoost());
        doc[_indexFields[i]] = field;
        sf.Add(_indexFields[i]);
    }

    String term = null;
    GrowableStringArray terms = null;
    try
    {
        int i = 0;
        int delta = (int)Math.Ceiling(((double)_numTerms / 100));
        TermEnum te = _luke.IndexReader.Terms();
        TermPositions tp = _luke.IndexReader.TermPositions();
        while (te.Next())
        {
            if ((i++ % delta) == 0)
            {
                // update UI - async
                UpdateProgress(i / delta);
            }

            // skip stored fields
            if (sf.Contains(te.Term().Field()))
            {
                continue;
            }

            tp.Seek(te.Term());
            if (!tp.SkipTo(docNum) || tp.Doc() != docNum)
            {
                // this term is not found in the doc
                continue;
            }

            term = te.Term().Text();
            terms = (GrowableStringArray)doc[te.Term().Field()];
            if (terms == null)
            {
                terms = new GrowableStringArray();
                doc[te.Term().Field()] = terms;
            }
            for (int k = 0; k < tp.Freq(); k++)
            {
                int pos = tp.NextPosition();
                terms.Set(pos, term);
            }
        }
    }
    catch (Exception exc)
    {
        // Update UI - async
        _luke.ShowStatus(exc.Message);
    }
}
public Term Term()
{
    Term t = termEnum.Term();
    return t != null && t.Field() == fieldName ? t : null;
}
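A hedged sketch of how a field-scoped wrapper like this Term() is usually consumed, assuming the wrapper (here called FieldTermEnum, a hypothetical name) was seeded with reader.Terms(new Term(fieldName, string.Empty)) and also exposes a Next() like the one shown earlier: enumeration simply stops once Term() returns null, i.e. once the underlying enum leaves fieldName.

    var fieldTerms = new FieldTermEnum(reader, "title"); // hypothetical wrapper type and field
    for (Term t = fieldTerms.Term(); t != null; t = fieldTerms.Term())
    {
        Console.WriteLine(t.Text());
        if (!fieldTerms.Next())
        {
            break;
        }
    }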
public virtual void TestKnownSetOfDocuments()
{
    System.String test1 = "eating chocolate in a computer lab"; // 6 terms
    System.String test2 = "computer in a computer lab"; // 5 terms
    System.String test3 = "a chocolate lab grows old"; // 5 terms
    System.String test4 = "eating chocolate with a chocolate lab in an old chocolate colored computer lab"; // 13 terms
    System.Collections.IDictionary test4Map = new System.Collections.Hashtable();
    test4Map["chocolate"] = 3;
    test4Map["lab"] = 2;
    test4Map["eating"] = 1;
    test4Map["computer"] = 1;
    test4Map["with"] = 1;
    test4Map["a"] = 1;
    test4Map["colored"] = 1;
    test4Map["in"] = 1;
    test4Map["an"] = 1;
    test4Map["computer"] = 1;
    test4Map["old"] = 1;

    Document testDoc1 = new Document();
    SetupDoc(testDoc1, test1);
    Document testDoc2 = new Document();
    SetupDoc(testDoc2, test2);
    Document testDoc3 = new Document();
    SetupDoc(testDoc3, test3);
    Document testDoc4 = new Document();
    SetupDoc(testDoc4, test4);

    Directory dir = new MockRAMDirectory();

    try
    {
        IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
        Assert.IsTrue(writer != null);
        writer.AddDocument(testDoc1);
        writer.AddDocument(testDoc2);
        writer.AddDocument(testDoc3);
        writer.AddDocument(testDoc4);
        writer.Close();

        IndexSearcher knownSearcher = new IndexSearcher(dir);
        TermEnum termEnum = knownSearcher.reader_ForNUnit.Terms();
        TermDocs termDocs = knownSearcher.reader_ForNUnit.TermDocs();
        //System.out.println("Terms: " + termEnum.size() + " Orig Len: " + termArray.length);

        Similarity sim = knownSearcher.GetSimilarity();
        while (termEnum.Next() == true)
        {
            Term term = termEnum.Term();
            //System.out.println("Term: " + term);
            termDocs.Seek(term);
            while (termDocs.Next())
            {
                int docId = termDocs.Doc();
                int freq = termDocs.Freq();
                //System.out.println("Doc Id: " + docId + " freq " + freq);
                TermFreqVector vector = knownSearcher.reader_ForNUnit.GetTermFreqVector(docId, "field");
                float tf = sim.Tf(freq);
                float idf = sim.Idf(term, knownSearcher);
                //float qNorm = sim.queryNorm()
                //This is fine since we don't have stop words
                float lNorm = sim.LengthNorm("field", vector.GetTerms().Length);
                //float coord = sim.coord()
                //System.out.println("TF: " + tf + " IDF: " + idf + " LenNorm: " + lNorm);
                Assert.IsTrue(vector != null);
                System.String[] vTerms = vector.GetTerms();
                int[] freqs = vector.GetTermFrequencies();
                for (int i = 0; i < vTerms.Length; i++)
                {
                    if (term.Text().Equals(vTerms[i]))
                    {
                        Assert.IsTrue(freqs[i] == freq);
                    }
                }
            }
            //System.out.println("--------");
        }

        Query query = new TermQuery(new Term("field", "chocolate"));
        ScoreDoc[] hits = knownSearcher.Search(query, null, 1000).scoreDocs;
        //doc 3 should be the first hit b/c it is the shortest match
        Assert.IsTrue(hits.Length == 3);
        float score = hits[0].score;
        /*System.out.println("Hit 0: " + hits.id(0) + " Score: " + hits.score(0) + " String: " + hits.doc(0).toString());
         * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(0)));
         * System.out.println("Hit 1: " + hits.id(1) + " Score: " + hits.score(1) + " String: " + hits.doc(1).toString());
         * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(1)));
         * System.out.println("Hit 2: " + hits.id(2) + " Score: " + hits.score(2) + " String: " + hits.doc(2).toString());
         * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(2)));*/
        Assert.IsTrue(hits[0].doc == 2);
        Assert.IsTrue(hits[1].doc == 3);
        Assert.IsTrue(hits[2].doc == 0);

        TermFreqVector vector2 = knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].doc, "field");
        Assert.IsTrue(vector2 != null);
        //System.out.println("Vector: " + vector);
        System.String[] terms = vector2.GetTerms();
        int[] freqs2 = vector2.GetTermFrequencies();
        Assert.IsTrue(terms != null && terms.Length == 10);
        for (int i = 0; i < terms.Length; i++)
        {
            System.String term = terms[i];
            //System.out.println("Term: " + term);
            int freq = freqs2[i];
            Assert.IsTrue(test4.IndexOf(term) != -1);
            System.Int32 freqInt = -1;
            try
            {
                freqInt = (System.Int32)test4Map[term];
            }
            catch (Exception)
            {
                Assert.IsTrue(false);
            }
            Assert.IsTrue(freqInt == freq);
        }

        SortedTermVectorMapper mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
        knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].doc, mapper);
        System.Collections.Generic.SortedDictionary<object, object> vectorEntrySet = mapper.GetTermVectorEntrySet();
        Assert.IsTrue(vectorEntrySet.Count == 10, "mapper.getTermVectorEntrySet() Size: " + vectorEntrySet.Count + " is not: " + 10);
        TermVectorEntry last = null;
        foreach (TermVectorEntry tve in vectorEntrySet.Keys)
        {
            if (tve != null && last != null)
            {
                Assert.IsTrue(last.GetFrequency() >= tve.GetFrequency(), "terms are not properly sorted");
                System.Int32 expectedFreq = (System.Int32)test4Map[tve.GetTerm()];
                //we expect double the expectedFreq, since there are two fields with the exact same text and we are collapsing all fields
                Assert.IsTrue(tve.GetFrequency() == 2 * expectedFreq, "Frequency is not correct:");
            }
            last = tve;
        }

        FieldSortedTermVectorMapper fieldMapper = new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
        knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].doc, fieldMapper);
        System.Collections.IDictionary map = fieldMapper.GetFieldToTerms();
        Assert.IsTrue(map.Count == 2, "map Size: " + map.Count + " is not: " + 2);
        vectorEntrySet = (System.Collections.Generic.SortedDictionary<Object, Object>)map["field"];
        Assert.IsTrue(vectorEntrySet != null, "vectorEntrySet is null and it shouldn't be");
        Assert.IsTrue(vectorEntrySet.Count == 10, "vectorEntrySet Size: " + vectorEntrySet.Count + " is not: " + 10);

        knownSearcher.Close();
    }
    catch (System.IO.IOException e)
    {
        System.Console.Error.WriteLine(e.StackTrace);
        Assert.IsTrue(false);
    }
}
public void End(bool shouldClose)
{
    if (!_is_started)
    {
        return;
    }
    if (!shouldClose)
    {
        return;
    }

    // build the list of files to delete from the index
    if (!_job_status.Cancelled)
    {
        TermEnum term_enum = _index_reader.Terms();
        Term path_term = new Term("path");
        int nb_terms = 0;
        while (term_enum.SkipTo(path_term)) // skip to new term equal or *ABOVE* "path:" !!!
        {
            Term term = term_enum.Term();
            if (term.Field() != path_term.Field())
            {
                break;
            }
            if (!File.Exists(term.Text()))
            {
                _del_file_list.Add(term.Text());
            }
            if (_job_status.Cancelled)
            {
                break;
            }
            nb_terms++;
        }
        term_enum.Close();
        Logger.Log.Info("update: deletion: {0} analyzed terms, found {1} vanished files.", nb_terms, _del_file_list.Count);
    }
    _index_searcher.Close();
    _index_reader.Close();

    //--- deleting deprecated
    if ((_del_file_list.Count > 0) && (!_job_status.Cancelled))
    {
        Stopwatch watch = new Stopwatch();
        watch.Start();

        int num_file = 0;
        int nb_files = _del_file_list.Count;
        IndexWriter writer = new IndexWriter(_index_path, _default_analyzer, false);
        foreach (string path in _del_file_list)
        {
            if (((num_file++) % 101) == 1)
            {
                int progress = (num_file * 100) / nb_files;
                _job_status.Progress = progress;
                _job_status.Description = String.Format("upd: removing (from index) file {0}/{1} - {2}",
                    num_file, _del_file_list.Count,
                    StringFu.TimeSpanToString(new TimeSpan((long)(watch.ElapsedMilliseconds) * 10000)));
            }
            if (_job_status.Cancelled)
            {
                break;
            }
            writer.DeleteDocuments(new Term("path", path));
        }
        writer.Commit();
        writer.Close();
        watch.Stop();
    }

    // adding new files
    if ((_add_file_list.Count > 0) && (!_job_status.Cancelled))
    {
        Stopwatch watch = new Stopwatch();
        watch.Start();
        IndexWriter writer = null;
        try
        {
            writer = new IndexWriter(_index_path, _default_analyzer, false, new IndexWriter.MaxFieldLength(IndexWriter.DEFAULT_MAX_FIELD_LENGTH));
            int num_file = 0;
            int nb_files = _add_file_list.Count;
            foreach (BasicFileInfo fi in _add_file_list)
            {
                if (((num_file++) % 101) == 1)
                {
                    int progress = (num_file * 100) / nb_files;
                    _job_status.Progress = progress;
                    _job_status.Description = String.Format("upd: indexing new file {0}/{1} - {2}",
                        num_file, _add_file_list.Count,
                        StringFu.TimeSpanToString(new TimeSpan((long)(watch.ElapsedMilliseconds) * 10000)));
                }
                if (_job_status.Cancelled)
                {
                    break;
                }
                writer.AddDocument(_doc_factory.CreateFromPath(fi.FilePath, fi.LastModification));
                if (num_file % 20 == 0)
                {
                    writer.Commit();
                }
            }
            writer.Commit();
        }
        catch (System.Exception ex)
        {
            Log.Error(ex);
        }
        finally
        {
            if (writer != null)
            {
                writer.Close();
                writer = null;
            }
        }
        watch.Stop();
    }

    // updating modified files
    if ((_upd_file_list.Count > 0) && (!_job_status.Cancelled))
    {
        Stopwatch watch = new Stopwatch();
        watch.Start();
        int num_file = 0;
        int nb_files = _upd_file_list.Count;
        IndexWriter writer = null;
        try
        {
            writer = new IndexWriter(_index_path, _default_analyzer, false, new IndexWriter.MaxFieldLength(IndexWriter.DEFAULT_MAX_FIELD_LENGTH));
            foreach (BasicFileInfo fi in _upd_file_list)
            {
                if (((num_file++) % 101) == 1)
                {
                    int progress = (num_file * 100) / nb_files;
                    _job_status.Progress = progress;
                    _job_status.Description = String.Format("upd: modified file {0}/{1} - {2}",
                        num_file, _upd_file_list.Count,
                        StringFu.TimeSpanToString(new TimeSpan((long)(watch.ElapsedMilliseconds) * 10000)));
                }
                if (_job_status.Cancelled)
                {
                    break;
                }
                writer.UpdateDocument(new Term("path", fi.FilePath), _doc_factory.CreateFromPath(fi.FilePath, fi.LastModification));
            }
            writer.Commit();
            //LittleBeagle.Properties.Settings.Default.NbIndexedFiles = num_file;
        }
        catch (System.Exception ex)
        {
            Log.Error(ex);
        }
        finally
        {
            if (writer != null)
            {
                writer.Close();
                writer = null;
            }
        }
        watch.Stop();
    }
}
public static TermInfo[] GetHighFreqTerms(Directory dir, Hashtable junkWords, int numTerms, String[] fields)
{
    if (dir == null || fields == null)
    {
        return new TermInfo[0];
    }
    IndexReader reader = IndexReader.Open(dir, true);
    TermInfoQueue tiq = new TermInfoQueue(numTerms);
    TermEnum terms = reader.Terms();
    int minFreq = 0;
    while (terms.Next())
    {
        String field = terms.Term().Field();
        if (fields != null && fields.Length > 0)
        {
            bool skip = true;
            for (int i = 0; i < fields.Length; i++)
            {
                if (field.Equals(fields[i]))
                {
                    skip = false;
                    break;
                }
            }
            if (skip)
            {
                continue;
            }
        }
        if (junkWords != null && junkWords[terms.Term().Text()] != null)
        {
            continue;
        }
        if (terms.DocFreq() > minFreq)
        {
            TermInfo top = (TermInfo)tiq.Add(new TermInfo(terms.Term(), terms.DocFreq()));
            if (tiq.Size() >= numTerms) // if tiq overfull
            {
                tiq.Pop();             // remove lowest in tiq
                minFreq = top.DocFreq; // reset minFreq
            }
        }
    }
    TermInfo[] res = new TermInfo[tiq.Size()];
    for (int i = 0; i < res.Length; i++)
    {
        res[res.Length - i - 1] = (TermInfo)tiq.Pop();
    }
    reader.Close();
    return res;
}
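A usage sketch for the helper above (the indexDirectory variable, junk words, and field name are illustrative): list the 25 most frequent terms of the "contents" field, skipping a couple of stop words.

    // indexDirectory is an already-opened Lucene Directory.
    Hashtable junk = new Hashtable();
    junk["the"] = "the";
    junk["and"] = "and";

    TermInfo[] top = GetHighFreqTerms(indexDirectory, junk, 25, new String[] { "contents" });
    foreach (TermInfo ti in top)
    {
        // TermInfo is assumed to expose the Term and its DocFreq, as used above.
        Console.WriteLine("{0} ({1} docs)", ti.Term, ti.DocFreq);
    }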
// There are two ways we can determine the max_results
// most recent items:
//
// One is to instantiate Lucene documents for each of
// the document IDs in primary_matches. This is a
// fairly expensive operation.
//
// The other is to walk through the list of all
// document IDs in descending time order. This is
// a less expensive operation, but adds up over time
// on large data sets.
//
// We can walk about 2.5 docs for every Document we
// instantiate. So what we'll do, if we have more
// matches than available hits, is walk (m * 1.25)
// docs to see if we can fill out the top 100 hits.
// If not, we'll fall back to creating documents
// for all of them.
private static ArrayList ScanRecentDocs(IndexReader primary_reader,
                                        IndexReader secondary_reader,
                                        BetterBitArray primary_matches,
                                        Dictionary<int, Hit> hits_by_id,
                                        int max_results,
                                        ref int total_number_of_matches,
                                        HitFilter hit_filter,
                                        string index_name)
{
    Stopwatch a = new Stopwatch();
    a.Start();

    TermDocs docs = primary_reader.TermDocs();
    TermEnum enumerator = primary_reader.Terms(new Term("InvertedTimestamp", String.Empty));
    ArrayList results = new ArrayList(max_results);
    int docs_found = 0;
    int docs_walked = 0;
    int hit_filter_removed = 0;
    int max_docs = (int)(primary_matches.TrueCount * 1.25);

    Term term;
    TermDocs secondary_term_docs = null;
    if (secondary_reader != null)
    {
        secondary_term_docs = secondary_reader.TermDocs();
    }

    do
    {
        term = enumerator.Term();
        if (term == null || term.Field() != "InvertedTimestamp")
        {
            break;
        }
        docs.Seek(enumerator);
        while (docs.Next() && docs_found < max_results && docs_walked < max_docs)
        {
            int doc_id = docs.Doc();
            if (primary_matches.Get(doc_id))
            {
                Document doc = primary_reader.Document(doc_id);
                Hit hit = CreateHit(doc, secondary_reader, secondary_term_docs);

                // If we have a HitFilter, apply it.
                if (hit_filter != null && !hit_filter(hit))
                {
                    if (Debug)
                    {
                        Log.Debug("Filtered out {0}", hit.Uri);
                    }
                    hit_filter_removed++;
                    continue;
                }
                hits_by_id[doc_id] = hit;
                // Add the result, last modified first
                results.Add(hit);
                docs_found++;
            }
            docs_walked++;
        }
    } while (enumerator.Next() && docs_found < max_results && docs_walked < max_docs);

    docs.Close();
    if (secondary_term_docs != null)
    {
        secondary_term_docs.Close();
    }

    // If we've found all the docs we can return in a subset!
    // Fantastic, we've probably short circuited a slow search.
    if (docs_found != max_results)
    {
        // Otherwise bad luck! Not all docs found.
        // Start afresh - this time traversing all results.
        results = null;
    }
    else
    {
        // Adjust total_number_of_matches. We need to do this to avoid scenarios like the following:
        // max_hits = 100. Matched 100 results. But hit filter removed 30. So 70 results will be returned.
        // We want to avoid saying "Showing top 70 of 100". Note that since we are not passing
        // every document in the index through the hit_filter, when we say "Showing top 100 of 1234", the
        // 1234 could actually be much less. But since max_hits was 100, that will not mislead the user.
        total_number_of_matches -= hit_filter_removed;
    }

    a.Stop();
    if (Debug)
    {
        Log.Debug(">>> {0}: Walked {1} items, populated an enum with {2} items in {3}", index_name, docs_walked, docs_found, a);
        if (docs_found == max_results)
        {
            Log.Debug(">>> {0}: Successfully short circuited timestamp ordering!", index_name);
        }
    }
    return results;
}
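A small worked example of the walk-versus-instantiate tradeoff described in the leading comment (the numbers are illustrative, not from the source): walking a posting is roughly 2.5x cheaper than instantiating a Document, so the method allows itself 1.25 * matches walked docs before falling back to materializing everything.

    int matched = 1000;                    // primary_matches.TrueCount
    int max_results = 100;
    int max_docs = (int)(matched * 1.25);  // up to 1250 docs may be walked

    // Walking 1250 docs costs about as much as instantiating ~500 Documents,
    // which beats materializing all 1000 matches whenever the newest 100
    // matches show up inside that window.
    Console.WriteLine("walk at most {0} docs to fill {1} hits", max_docs, max_results);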