Code example #1
        /// <summary>
        /// Finds the specified number of terms
        /// </summary>
        /// <param name="num"></param>
        /// <returns></returns>
        public TermModel[] FindTerms(int num)
        {
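            // Keep a bounded priority queue of the most frequent terms: 'count' tracks
            // the doc frequency of the cheapest term still in the queue, so terms with a
            // lower frequency can be skipped without being queued at all.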
            num++;
            TermInfoQueue queue = new TermInfoQueue(num);
            TermEnum      enum2 = open.Reader.Terms();
            int           count = 0;

            while (enum2.Next())
            {
                string str = enum2.Term().Field();
                if ((currentField != null) && (!str.Equals(currentField)))
                {
                    continue;
                }
                if (enum2.DocFreq() > count)
                {
                    queue.Put(new TermModel(enum2.Term(), enum2.DocFreq()));
                    if (queue.Size() < num)
                    {
                        continue;
                    }
                    queue.Pop();
                    count = ((TermModel)queue.Top()).Count;
                }
            }
            enum2.Close();
            TermModel[] modelArray = new TermModel[queue.Size()];
            for (int i = 0; i < modelArray.Length; i++)
            {
                modelArray[(modelArray.Length - i) - 1] = (TermModel)queue.Pop();
            }
            return(modelArray);
        }
Code example #2
        public override BitArray Bits(IndexReader reader)
        {
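            // Builds a filter over documents whose revision range overlaps [_revFirst, _revLast]:
            // first mark documents whose last revision is >= _revFirst, then clear documents
            // whose first revision is > _revLast, and AND the two bit sets together.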
            // reader.GetVersion could be used to cache
            // Debug.WriteLine(reader.GetVersion()); // could be used to cache
            // if (cached reader == reader && _revFirst ==

            if (_revFirst == All || _revLast == All) // optimization
            {
                return(new BitArray(reader.MaxDoc(), true));
            }

            BitArray last_bits = new BitArray(reader.MaxDoc(), false);

            TermEnum t = reader.Terms(new Term(FieldName.RevisionLast, _revFirst.ToString(RevFormat)));
            TermDocs d = reader.TermDocs();

            //if (t.SkipTo((new Term(FieldName.RevisionLast, revision.ToString(RevFormat))))) // extremely slow
            if (t.Term() != null)
            {
                while (t.Term().Field() == FieldName.RevisionLast)
                {
                    d.Seek(t);
                    while (d.Next())
                    {
                        last_bits[d.Doc()] = true;
                    }
                    if (!t.Next())
                    {
                        break;
                    }
                }
            }

            // optimization: skip the second pass if we are only using the head revision
            if (_revLast == Head)
            {
                return(last_bits);
            }

            BitArray first_bits = new BitArray(reader.MaxDoc(), true);

            t = reader.Terms(new Term("rev_first", (_revLast + 1).ToString(RevFormat)));
            //if (t.SkipTo((new Term("rev_first", (revision + 1).ToString(RevFormat))))) // extremely slow
            if (t.Term() != null)
            {
                while (t.Term().Field() == "rev_first")
                {
                    d.Seek(t);
                    while (d.Next())
                    {
                        first_bits[d.Doc()] = false;
                    }
                    if (!t.Next())
                    {
                        break;
                    }
                }
            }
            return(last_bits.And(first_bits));
        }
Code example #3
File: IndexerTests.cs Project: wlshitou/svnquery
        static void CheckIsHeadOnly(IndexSearcher searcher)
        {
            TermEnum t = searcher.Reader.Terms(new Term(FieldName.RevisionLast, "0"));

            Assert.IsNotNull(t);
            Assert.AreEqual(FieldName.RevisionLast, t.Term().Field());
            while (t.Term().Field() == FieldName.RevisionLast)
            {
                Assert.AreEqual(Revision.HeadString, t.Term().Text());
                if (!t.Next())
                {
                    break;
                }
            }
        }
Code example #4
        private int[] docMap;           // use getDocMap()

        internal SegmentMergeInfo(int b, TermEnum te, IndexReader r)
        {
            base_Renamed = b;
            reader       = r;
            termEnum     = te;
            term         = te.Term();
        }
Code example #5
        public IEnumerable <TermInfo> GetTerms()
        {
            var         directory   = _openIndexModel.Directory;
            IndexReader indexReader = null;
            TermEnum    terms       = null;

            try
            {
                indexReader = IndexReader.Open(directory, true); // TODO: should I open this only once?
                terms       = indexReader.Terms();

                while (terms.Next())
                {
                    System.Threading.Thread.Sleep(2);
                    var term = terms.Term();
                    yield return(new TermInfo {
                        Term = term.Text(), Field = term.Field(), Frequency = terms.DocFreq()
                    });
                }
            }
            finally
            {
                if (indexReader != null)
                {
                    indexReader.Close();
                }

                if (terms != null)
                {
                    terms.Close();
                }
            }

            yield break;
        }
Code example #6
        /// <summary>
        /// Finds a term in the specified field
        /// </summary>
        /// <param name="field"></param>
        /// <param name="text"></param>
        /// <param name="current"></param>
        /// <returns></returns>
        public TermModel FindTerm(string field, string text, bool current)
        {
            TermEnum enum2 = open.Reader.Terms();

            if (enum2.SkipTo(new Term(field, text)))
            {
                TermModel model2 = null;
                while ((!current && enum2.Next() && field.Equals(enum2.Term().Field())) ||
                       current)
                {
                    model2 = new TermModel(enum2.Term(), enum2.DocFreq());
                    break;
                }
                enum2.Close();
                return(model2);
            }
            return(null);
        }
Code example #7
        private OpenBitSet FastBits(IndexReader reader)
        {
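            // Start with every document marked valid, then for each key term with more than
            // one posting clear all of its documents except the occurrence we want to keep
            // (first or last, depending on keepMode).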
            OpenBitSet bits = new OpenBitSet(reader.MaxDoc());

            bits.Set(0, reader.MaxDoc()); //assume all are valid
            Term     startTerm = new Term(fieldName);
            TermEnum te        = reader.Terms(startTerm);

            if (te != null)
            {
                Term currTerm = te.Term();

                while ((currTerm != null) && (currTerm.Field() == startTerm.Field())) //term fieldnames are interned
                {
                    if (te.DocFreq() > 1)
                    {
                        int lastDoc = -1;
                        //unset potential duplicates
                        TermDocs td = reader.TermDocs(currTerm);
                        td.Next();
                        if (keepMode == KM_USE_FIRST_OCCURRENCE)
                        {
                            td.Next();
                        }
                        do
                        {
                            lastDoc = td.Doc();
                            bits.Clear(lastDoc);
                        } while (td.Next());
                        if (keepMode == KM_USE_LAST_OCCURRENCE)
                        {
                            //restore the last bit
                            bits.Set(lastDoc);
                        }
                    }
                    if (!te.Next())
                    {
                        break;
                    }
                    currTerm = te.Term();
                }
            }
            return(bits);
        }
Code example #8
        public void DisplayInternalIndex()
        {
            Directory   mainIndexDir = SearchFactory.GetSearchFactory(sessions).GetDirectoryProvider(typeof(Book)).Directory;
            IndexReader reader       = IndexReader.Open(mainIndexDir);
            TermEnum    terms        = reader.Terms();

            while (terms.Next())
            {
                Term term = terms.Term();
                log.Debug("In " + term.Field() + ": " + term.Text());
            }
        }
Code example #9
File: HighestRevision.cs Project: wlshitou/svnquery
        public int Get(string path)
        {
            int revision;

            lock (_highest)
            {
                if (_highest.TryGetValue(path, out revision))
                {
                    return(revision);
                }
            }

            if (Reader == null)
            {
                return(0);
            }
            path += "@";
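            // Ids are stored as "path@revision" terms: scan every term with this path prefix
            // and remember the highest revision (and the document that carries it).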
            TermEnum t   = Reader.Terms(new Term(FieldName.Id, path));
            int      doc = -1;

            while (t.Term() != null && t.Term().Text().StartsWith(path))
            {
                int r = int.Parse(t.Term().Text().Substring(path.Length));
                if (r > revision)
                {
                    revision = r;
                    TermDocs d = Reader.TermDocs(t.Term());
                    d.Next();
                    doc = d.Doc();
                }
                t.Next();
            }
            t.Close();
            if (revision != 0 && Reader.Document(doc).Get(FieldName.RevisionLast) != Revision.HeadString)
            {
                return(0);
            }
            return(revision);
        }
Code example #10
 internal bool Next()
 {
     if (termEnum.Next())
     {
         term = termEnum.Term();
         return(true);
     }
     else
     {
         term = null;
         return(false);
     }
 }
Code example #11
        private OpenBitSet CorrectBits(IndexReader reader)
        {
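            // Set exactly one bit per key term: the first posting when keepMode is
            // KM_USE_FIRST_OCCURRENCE, otherwise the last posting for that term.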
            OpenBitSet bits      = new OpenBitSet(reader.MaxDoc()); //assume all are INvalid
            Term       startTerm = new Term(fieldName);
            TermEnum   te        = reader.Terms(startTerm);

            if (te != null)
            {
                Term currTerm = te.Term();
                while ((currTerm != null) && (currTerm.Field() == startTerm.Field())) //term fieldnames are interned
                {
                    int lastDoc = -1;
                    //set non duplicates
                    TermDocs td = reader.TermDocs(currTerm);
                    if (td.Next())
                    {
                        if (keepMode == KM_USE_FIRST_OCCURRENCE)
                        {
                            bits.Set(td.Doc());
                        }
                        else
                        {
                            do
                            {
                                lastDoc = td.Doc();
                            } while (td.Next());
                            bits.Set(lastDoc);
                        }
                    }
                    if (!te.Next())
                    {
                        break;
                    }
                    currTerm = te.Term();
                }
            }
            return(bits);
        }
Code example #12
        private IEnumerable <string> GetFieldValues(IndexReader reader, string groupByField)
        {
            TermEnum te = reader.Terms(new Term(groupByField, string.Empty));

            if (te.Term() == null || te.Term().Field() != groupByField)
            {
                return(Enumerable.Empty <string>());
            }

            var list = new List <string>();

            list.Add(te.Term().Text());

            while (te.Next())
            {
                if (te.Term().Field() != groupByField)
                {
                    break;
                }

                list.Add(te.Term().Text());
            }
            return(list);
        }
Code example #13
 private void buttonFirstTerm_Click(object sender, System.EventArgs e)
 {
     if (_luke.IndexReader == null)
     {
         _luke.ShowStatus(_luke.resources.GetString("NoIndex"));
         return;
     }
     try
     {
         TermEnum te = _luke.IndexReader.Terms();
         te.Next();
         Term t = te.Term();
         _ShowTerm(t);
     }
     catch (Exception exc)
     {
         _luke.ShowStatus(exc.Message);
     }
 }
Code example #14
File: TestTermVectors.cs Project: yonder/mono
        public virtual void  TestKnownSetOfDocuments()
        {
            System.String[] termArray = new System.String[] { "eating", "chocolate", "in", "a", "computer", "lab", "grows", "old", "colored", "with", "an" };
            System.String   test1     = "eating chocolate in a computer lab";                                             //6 terms
            System.String   test2     = "computer in a computer lab";                                                     //5 terms
            System.String   test3     = "a chocolate lab grows old";                                                      //5 terms
            System.String   test4     = "eating chocolate with a chocolate lab in an old chocolate colored computer lab"; //13 terms
            System.Collections.IDictionary test4Map = new System.Collections.Hashtable();
            test4Map["chocolate"] = 3;
            test4Map["lab"]       = 2;
            test4Map["eating"]    = 1;
            test4Map["computer"]  = 1;
            test4Map["with"]      = 1;
            test4Map["a"]         = 1;
            test4Map["colored"]   = 1;
            test4Map["in"]        = 1;
            test4Map["an"]        = 1;
            test4Map["computer"]  = 1;
            test4Map["old"]       = 1;

            Document testDoc1 = new Document();

            SetupDoc(testDoc1, test1);
            Document testDoc2 = new Document();

            SetupDoc(testDoc2, test2);
            Document testDoc3 = new Document();

            SetupDoc(testDoc3, test3);
            Document testDoc4 = new Document();

            SetupDoc(testDoc4, test4);

            Directory dir = new RAMDirectory();

            try
            {
                IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true);
                Assert.IsTrue(writer != null);
                writer.AddDocument(testDoc1);
                writer.AddDocument(testDoc2);
                writer.AddDocument(testDoc3);
                writer.AddDocument(testDoc4);
                writer.Close();
                IndexSearcher knownSearcher = new IndexSearcher(dir);
                TermEnum      termEnum      = knownSearcher.reader.Terms();
                TermDocs      termDocs      = knownSearcher.reader.TermDocs();
                //System.out.println("Terms: " + termEnum.size() + " Orig Len: " + termArray.length);

                Similarity sim = knownSearcher.GetSimilarity();
                while (termEnum.Next() == true)
                {
                    Term term = termEnum.Term();
                    //System.out.println("Term: " + term);
                    termDocs.Seek(term);
                    while (termDocs.Next())
                    {
                        int docId = termDocs.Doc();
                        int freq  = termDocs.Freq();
                        //System.out.println("Doc Id: " + docId + " freq " + freq);
                        TermFreqVector vector = knownSearcher.reader.GetTermFreqVector(docId, "Field");
                        float          tf     = sim.Tf(freq);
                        float          idf    = sim.Idf(term, knownSearcher);
                        //float qNorm = sim.queryNorm()
                        //This is fine since we don't have stop words
                        float lNorm = sim.LengthNorm("Field", vector.GetTerms().Length);
                        //float coord = sim.coord()
                        //System.out.println("TF: " + tf + " IDF: " + idf + " LenNorm: " + lNorm);
                        Assert.IsTrue(vector != null);
                        System.String[] vTerms = vector.GetTerms();
                        int[]           freqs  = vector.GetTermFrequencies();
                        for (int i = 0; i < vTerms.Length; i++)
                        {
                            if (term.Text().Equals(vTerms[i]) == true)
                            {
                                Assert.IsTrue(freqs[i] == freq);
                            }
                        }
                    }
                    //System.out.println("--------");
                }
                Query query = new TermQuery(new Term("Field", "chocolate"));
                Hits  hits  = knownSearcher.Search(query);
                //doc 3 should be the first hit b/c it is the shortest match
                Assert.IsTrue(hits.Length() == 3);
                float score = hits.Score(0);

                /*System.out.println("Hit 0: " + hits.id(0) + " Score: " + hits.score(0) + " String: " + hits.doc(0).toString());
                 * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(0)));
                 * System.out.println("Hit 1: " + hits.id(1) + " Score: " + hits.score(1) + " String: " + hits.doc(1).toString());
                 * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(1)));
                 * System.out.println("Hit 2: " + hits.id(2) + " Score: " + hits.score(2) + " String: " +  hits.doc(2).toString());
                 * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(2)));*/
                Assert.IsTrue(testDoc3.ToString().Equals(hits.Doc(0).ToString()));
                Assert.IsTrue(testDoc4.ToString().Equals(hits.Doc(1).ToString()));
                Assert.IsTrue(testDoc1.ToString().Equals(hits.Doc(2).ToString()));
                TermFreqVector vector2 = knownSearcher.reader.GetTermFreqVector(hits.Id(1), "Field");
                Assert.IsTrue(vector2 != null);
                //System.out.println("Vector: " + vector);
                System.String[] terms  = vector2.GetTerms();
                int[]           freqs2 = vector2.GetTermFrequencies();
                Assert.IsTrue(terms != null && terms.Length == 10);
                for (int i = 0; i < terms.Length; i++)
                {
                    System.String term = terms[i];
                    //System.out.println("Term: " + term);
                    int freq = freqs2[i];
                    Assert.IsTrue(test4.IndexOf(term) != -1);
                    System.Int32  freqInt    = (System.Int32)test4Map[term];
                    System.Object tmpFreqInt = test4Map[term];
                    Assert.IsTrue(tmpFreqInt != null);
                    Assert.IsTrue(freqInt == freq);
                }
                knownSearcher.Close();
            }
            catch (System.IO.IOException e)
            {
                System.Console.Error.WriteLine(e.StackTrace);
                Assert.IsTrue(false);
            }
        }
Code example #15
        private void AddTerms(IndexReader reader, FieldVals f)
        {
            if (f.queryString == null)
            {
                return;
            }
            TokenStream   ts      = analyzer.TokenStream(f.fieldName, new System.IO.StringReader(f.queryString));
            TermAttribute termAtt = (TermAttribute)ts.AddAttribute(typeof(TermAttribute));

            int       corpusNumDocs            = reader.NumDocs();
            Term      internSavingTemplateTerm = new Term(f.fieldName); //optimization to avoid constructing new Term() objects
            Hashtable processedTerms           = new Hashtable();

            while (ts.IncrementToken())
            {
                String term = termAtt.Term();
                if (!processedTerms.Contains(term))
                {
                    processedTerms.Add(term, term);
                    ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
                    float          minScore  = 0;
                    Term           startTerm = internSavingTemplateTerm.CreateTerm(term);
                    FuzzyTermEnum  fe        = new FuzzyTermEnum(reader, startTerm, f.minSimilarity, f.prefixLength);
                    TermEnum       origEnum  = reader.Terms(startTerm);
                    int            df        = 0;
                    if (startTerm.Equals(origEnum.Term()))
                    {
                        df = origEnum.DocFreq(); //store the df so all variants use same idf
                    }
                    int numVariants          = 0;
                    int totalVariantDocFreqs = 0;
                    do
                    {
                        Term possibleMatch = fe.Term();
                        if (possibleMatch != null)
                        {
                            numVariants++;
                            totalVariantDocFreqs += fe.DocFreq();
                            float score = fe.Difference();
                            if (variantsQ.Size() < MAX_VARIANTS_PER_TERM || score > minScore)
                            {
                                ScoreTerm st = new ScoreTerm(possibleMatch, score, startTerm);
                                variantsQ.Insert(st);
                                minScore = ((ScoreTerm)variantsQ.Top()).score; // maintain minScore
                            }
                        }
                    } while (fe.Next());
                    if (numVariants > 0)
                    {
                        int avgDf = totalVariantDocFreqs / numVariants;
                        if (df == 0)    //no direct match we can use as df for all variants
                        {
                            df = avgDf; //use avg df of all variants
                        }

                        // take the top variants (scored by edit distance) and reset the score
                        // to include an IDF factor then add to the global queue for ranking
                        // overall top query terms
                        int size = variantsQ.Size();
                        for (int i = 0; i < size; i++)
                        {
                            ScoreTerm st = (ScoreTerm)variantsQ.Pop();
                            st.score = (st.score * st.score) * sim.Idf(df, corpusNumDocs);
                            q.Insert(st);
                        }
                    }
                }
            }
        }
Code example #16
        void BeginAsyncReconstruction(int docNum, Document document, Hashtable doc)
        {
            // get stored fields
            ArrayList sf = new ArrayList();

            for (int i = 0; i < _indexFields.Length; i++)
            {
                Field[] f = document.GetFields(_indexFields[i]);
                if (f == null || f.Length == 0 || !f[0].IsStored())
                {
                    continue;
                }
                StringBuilder sb = new StringBuilder();
                for (int k = 0; k < f.Length; k++)
                {
                    if (k > 0)
                    {
                        sb.Append('\n');
                    }
                    sb.Append(f[k].StringValue());
                }
                Field field = Legacy.CreateField(_indexFields[i], sb.ToString(), f[0].IsStored(), f[0].IsIndexed(), f[0].IsTokenized(), f[0].IsTermVectorStored());
                field.SetBoost(f[0].GetBoost());
                doc[_indexFields[i]] = field;
                sf.Add(_indexFields[i]);
            }
            String term = null;
            GrowableStringArray terms = null;

            try
            {
                int i     = 0;
                int delta = (int)Math.Ceiling(((double)_numTerms / 100));

                TermEnum      te = _luke.IndexReader.Terms();
                TermPositions tp = _luke.IndexReader.TermPositions();
                while (te.Next())
                {
                    if ((i++ % delta) == 0)
                    {
                        // update UI - async
                        UpdateProgress(i / delta);
                    }

                    // skip stored fields
                    if (sf.Contains(te.Term().Field()))
                    {
                        continue;
                    }
                    tp.Seek(te.Term());
                    if (!tp.SkipTo(docNum) || tp.Doc() != docNum)
                    {
                        // this term is not found in the doc
                        continue;
                    }
                    term  = te.Term().Text();
                    terms = (GrowableStringArray)doc[te.Term().Field()];
                    if (terms == null)
                    {
                        terms = new GrowableStringArray();
                        doc[te.Term().Field()] = terms;
                    }
                    for (int k = 0; k < tp.Freq(); k++)
                    {
                        int pos = tp.NextPosition();
                        terms.Set(pos, term);
                    }
                }
            }
            catch (Exception exc)
            {
                // Update UI - async
                _luke.ShowStatus(exc.Message);
            }
        }
Code example #17
        public Term Term()
        {
            Term t = termEnum.Term();

            return(t != null && t.Field() == fieldName ? t : null);
        }
Code example #18
        public virtual void  TestKnownSetOfDocuments()
        {
            System.String test1 = "eating chocolate in a computer lab";                                             //6 terms
            System.String test2 = "computer in a computer lab";                                                     //5 terms
            System.String test3 = "a chocolate lab grows old";                                                      //5 terms
            System.String test4 = "eating chocolate with a chocolate lab in an old chocolate colored computer lab"; //13 terms
            System.Collections.IDictionary test4Map = new System.Collections.Hashtable();
            test4Map["chocolate"] = 3;
            test4Map["lab"]       = 2;
            test4Map["eating"]    = 1;
            test4Map["computer"]  = 1;
            test4Map["with"]      = 1;
            test4Map["a"]         = 1;
            test4Map["colored"]   = 1;
            test4Map["in"]        = 1;
            test4Map["an"]        = 1;
            test4Map["computer"]  = 1;
            test4Map["old"]       = 1;

            Document testDoc1 = new Document();

            SetupDoc(testDoc1, test1);
            Document testDoc2 = new Document();

            SetupDoc(testDoc2, test2);
            Document testDoc3 = new Document();

            SetupDoc(testDoc3, test3);
            Document testDoc4 = new Document();

            SetupDoc(testDoc4, test4);

            Directory dir = new MockRAMDirectory();

            try
            {
                IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
                Assert.IsTrue(writer != null);
                writer.AddDocument(testDoc1);
                writer.AddDocument(testDoc2);
                writer.AddDocument(testDoc3);
                writer.AddDocument(testDoc4);
                writer.Close();
                IndexSearcher knownSearcher = new IndexSearcher(dir);
                TermEnum      termEnum      = knownSearcher.reader_ForNUnit.Terms();
                TermDocs      termDocs      = knownSearcher.reader_ForNUnit.TermDocs();
                //System.out.println("Terms: " + termEnum.size() + " Orig Len: " + termArray.length);

                Similarity sim = knownSearcher.GetSimilarity();
                while (termEnum.Next() == true)
                {
                    Term term = termEnum.Term();
                    //System.out.println("Term: " + term);
                    termDocs.Seek(term);
                    while (termDocs.Next())
                    {
                        int docId = termDocs.Doc();
                        int freq  = termDocs.Freq();
                        //System.out.println("Doc Id: " + docId + " freq " + freq);
                        TermFreqVector vector = knownSearcher.reader_ForNUnit.GetTermFreqVector(docId, "field");
                        float          tf     = sim.Tf(freq);
                        float          idf    = sim.Idf(term, knownSearcher);
                        //float qNorm = sim.queryNorm()
                        //This is fine since we don't have stop words
                        float lNorm = sim.LengthNorm("field", vector.GetTerms().Length);
                        //float coord = sim.coord()
                        //System.out.println("TF: " + tf + " IDF: " + idf + " LenNorm: " + lNorm);
                        Assert.IsTrue(vector != null);
                        System.String[] vTerms = vector.GetTerms();
                        int[]           freqs  = vector.GetTermFrequencies();
                        for (int i = 0; i < vTerms.Length; i++)
                        {
                            if (term.Text().Equals(vTerms[i]))
                            {
                                Assert.IsTrue(freqs[i] == freq);
                            }
                        }
                    }
                    //System.out.println("--------");
                }
                Query      query = new TermQuery(new Term("field", "chocolate"));
                ScoreDoc[] hits  = knownSearcher.Search(query, null, 1000).scoreDocs;
                //doc 3 should be the first hit b/c it is the shortest match
                Assert.IsTrue(hits.Length == 3);
                float score = hits[0].score;

                /*System.out.println("Hit 0: " + hits.id(0) + " Score: " + hits.score(0) + " String: " + hits.doc(0).toString());
                 * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(0)));
                 * System.out.println("Hit 1: " + hits.id(1) + " Score: " + hits.score(1) + " String: " + hits.doc(1).toString());
                 * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(1)));
                 * System.out.println("Hit 2: " + hits.id(2) + " Score: " + hits.score(2) + " String: " +  hits.doc(2).toString());
                 * System.out.println("Explain: " + knownSearcher.explain(query, hits.id(2)));*/
                Assert.IsTrue(hits[0].doc == 2);
                Assert.IsTrue(hits[1].doc == 3);
                Assert.IsTrue(hits[2].doc == 0);
                TermFreqVector vector2 = knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].doc, "field");
                Assert.IsTrue(vector2 != null);
                //System.out.println("Vector: " + vector);
                System.String[] terms  = vector2.GetTerms();
                int[]           freqs2 = vector2.GetTermFrequencies();
                Assert.IsTrue(terms != null && terms.Length == 10);
                for (int i = 0; i < terms.Length; i++)
                {
                    System.String term = terms[i];
                    //System.out.println("Term: " + term);
                    int freq = freqs2[i];
                    Assert.IsTrue(test4.IndexOf(term) != -1);
                    System.Int32 freqInt = -1;
                    try
                    {
                        freqInt = (System.Int32)test4Map[term];
                    }
                    catch (Exception)
                    {
                        Assert.IsTrue(false);
                    }
                    Assert.IsTrue(freqInt == freq);
                }
                SortedTermVectorMapper mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
                knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].doc, mapper);
                System.Collections.Generic.SortedDictionary <object, object> vectorEntrySet = mapper.GetTermVectorEntrySet();
                Assert.IsTrue(vectorEntrySet.Count == 10, "mapper.getTermVectorEntrySet() Size: " + vectorEntrySet.Count + " is not: " + 10);
                TermVectorEntry last = null;
                foreach (TermVectorEntry tve in vectorEntrySet.Keys)
                {
                    if (tve != null && last != null)
                    {
                        Assert.IsTrue(last.GetFrequency() >= tve.GetFrequency(), "terms are not properly sorted");
                        System.Int32 expectedFreq = (System.Int32)test4Map[tve.GetTerm()];
                        //we expect double the expectedFreq, since there are two fields with the exact same text and we are collapsing all fields
                        Assert.IsTrue(tve.GetFrequency() == 2 * expectedFreq, "Frequency is not correct:");
                    }
                    last = tve;
                }

                FieldSortedTermVectorMapper fieldMapper = new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
                knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].doc, fieldMapper);
                System.Collections.IDictionary map = fieldMapper.GetFieldToTerms();
                Assert.IsTrue(map.Count == 2, "map Size: " + map.Count + " is not: " + 2);
                vectorEntrySet = (System.Collections.Generic.SortedDictionary <Object, Object>)map["field"];
                Assert.IsTrue(vectorEntrySet != null, "vectorEntrySet is null and it shouldn't be");
                Assert.IsTrue(vectorEntrySet.Count == 10, "vectorEntrySet Size: " + vectorEntrySet.Count + " is not: " + 10);
                knownSearcher.Close();
            }
            catch (System.IO.IOException e)
            {
                System.Console.Error.WriteLine(e.StackTrace);
                Assert.IsTrue(false);
            }
        }
Code example #19
File: DocumentCrawler.cs Project: AnantLabs/owl-klb
        public void End(bool shouldClose)
        {
            if (!_is_started)
            {
                return;
            }
            if (!shouldClose)
            {
                return;
            }
            //build 2del file list
            if (!_job_status.Cancelled)
            {
                TermEnum term_enum = _index_reader.Terms();
                Term     path_term = new Term("path");
                int      nb_terms  = 0;
                while (term_enum.SkipTo(path_term))                 //skip to new term equal or *ABOVE* "path:" !!!
                {
                    Term term = term_enum.Term();
                    if (term.Field() != path_term.Field())
                    {
                        break;
                    }
                    if (!File.Exists(term.Text()))
                    {
                        _del_file_list.Add(term.Text());
                    }
                    if (_job_status.Cancelled)
                    {
                        break;
                    }
                    nb_terms++;
                }
                term_enum.Close();
                Logger.Log.Info("update: deletion: {0} analyzed terms, found {1} vanished files.", nb_terms, _del_file_list.Count);
            }
            _index_searcher.Close();
            _index_reader.Close();
            //--- deleting deprecated
            if ((_del_file_list.Count > 0) && (!_job_status.Cancelled))
            {
                Stopwatch watch = new Stopwatch();
                watch.Start();

                int         num_file = 0;
                int         nb_files = _del_file_list.Count;
                IndexWriter writer   = new IndexWriter(_index_path, _default_analyzer, false);

                foreach (string path in _del_file_list)
                {
                    if (((num_file++) % 101) == 1)
                    {
                        int progress = (num_file * 100) / nb_files;
                        _job_status.Progress    = progress;
                        _job_status.Description = String.Format("upd: removing (from index) file {0}/{1} - {2}", num_file, _del_file_list.Count,
                                                                StringFu.TimeSpanToString(new TimeSpan((long)(watch.ElapsedMilliseconds) * 10000)));
                    }
                    if (_job_status.Cancelled)
                    {
                        break;
                    }
                    writer.DeleteDocuments(new Term("path", path));
                }
                writer.Commit();
                writer.Close();
                watch.Stop();
            }
            //adding new files
            if ((_add_file_list.Count > 0) && (!_job_status.Cancelled))
            {
                Stopwatch watch = new Stopwatch();
                watch.Start();

                IndexWriter writer = null;
                try
                {
                    writer = new IndexWriter(_index_path, _default_analyzer, false, new IndexWriter.MaxFieldLength(IndexWriter.DEFAULT_MAX_FIELD_LENGTH));
                    int num_file = 0;
                    int nb_files = _add_file_list.Count;
                    foreach (BasicFileInfo fi in _add_file_list)
                    {
                        if (((num_file++) % 101) == 1)
                        {
                            int progress = (num_file * 100) / nb_files;
                            _job_status.Progress    = progress;
                            _job_status.Description = String.Format("upd: indexing new file {0}/{1} - {2}", num_file, _add_file_list.Count,
                                                                    StringFu.TimeSpanToString(new TimeSpan((long)(watch.ElapsedMilliseconds) * 10000)));
                        }
                        if (_job_status.Cancelled)
                        {
                            break;
                        }

                        writer.AddDocument(_doc_factory.CreateFromPath(fi.FilePath, fi.LastModification));
                        if (num_file % 20 == 0)
                        {
                            writer.Commit();
                        }
                    }
                    writer.Commit();
                }
                catch (System.Exception ex)
                {
                    Log.Error(ex);
                }
                finally
                {
                    if (writer != null)
                    {
                        writer.Close();
                        writer = null;
                    }
                }
                watch.Stop();
            }
            //updating modified files
            if ((_upd_file_list.Count > 0) && (!_job_status.Cancelled))
            {
                Stopwatch watch = new Stopwatch();
                watch.Start();

                int         num_file = 0;
                int         nb_files = _upd_file_list.Count;
                IndexWriter writer   = null;
                try
                {
                    writer = new IndexWriter(_index_path, _default_analyzer, false,
                                             new IndexWriter.MaxFieldLength(IndexWriter.DEFAULT_MAX_FIELD_LENGTH));

                    foreach (BasicFileInfo fi in _upd_file_list)
                    {
                        if (((num_file++) % 101) == 1)
                        {
                            int progress = (num_file * 100) / nb_files;
                            _job_status.Progress    = progress;
                            _job_status.Description = String.Format("upd: modified file {0}/{1} - {2}", num_file, _upd_file_list.Count,
                                                                    StringFu.TimeSpanToString(new TimeSpan((long)(watch.ElapsedMilliseconds) * 10000)));
                        }
                        if (_job_status.Cancelled)
                        {
                            break;
                        }
                        writer.UpdateDocument(new Term("path", fi.FilePath),
                                              _doc_factory.CreateFromPath(fi.FilePath, fi.LastModification));
                    }
                    writer.Commit();
                    //LittleBeagle.Properties.Settings.Default.NbIndexedFiles = num_file;
                }
                catch (System.Exception ex)
                {
                    Log.Error(ex);
                }
                finally
                {
                    if (writer != null)
                    {
                        writer.Close();
                        writer = null;
                    }
                }
                watch.Stop();
            }
        }
Code example #20
        public static TermInfo[] GetHighFreqTerms(Directory dir,
                                                  Hashtable junkWords,
                                                  int numTerms,
                                                  String[] fields)
        {
            if (dir == null || fields == null)
            {
                return(new TermInfo[0]);
            }

            IndexReader   reader = IndexReader.Open(dir, true);
            TermInfoQueue tiq    = new TermInfoQueue(numTerms);
            TermEnum      terms  = reader.Terms();

            int minFreq = 0;

            while (terms.Next())
            {
                String field = terms.Term().Field();

                if (fields != null && fields.Length > 0)
                {
                    bool skip = true;

                    for (int i = 0; i < fields.Length; i++)
                    {
                        if (field.Equals(fields[i]))
                        {
                            skip = false;
                            break;
                        }
                    }
                    if (skip)
                    {
                        continue;
                    }
                }

                if (junkWords != null && junkWords[terms.Term().Text()] != null)
                {
                    continue;
                }

                if (terms.DocFreq() > minFreq)
                {
                    TermInfo top = (TermInfo)tiq.Add(new TermInfo(terms.Term(), terms.DocFreq()));
                    if (tiq.Size() >= numTerms)                        // if tiq overfull
                    {
                        tiq.Pop();                                     // remove lowest in tiq
                        minFreq = top.DocFreq;                         // reset minFreq
                    }
                }
            }

            TermInfo[] res = new TermInfo[tiq.Size()];

            for (int i = 0; i < res.Length; i++)
            {
                res[res.Length - i - 1] = (TermInfo)tiq.Pop();
            }

            reader.Close();

            return(res);
        }
Code example #21
        // There are two ways we can determine the max_results
        // most recent items:
        //
        // One is to instantiate Lucene documents for each of
        // the document IDs in primary_matches.  This is a
        // fairly expensive operation.
        //
        // The other is to walk through the list of all
        // document IDs in descending time order.  This is
        // a less expensive operation, but adds up over time
        // on large data sets.
        //
        // We can walk about 2.5 docs for every Document we
        // instantiate.  So what we'll do, if we have more
        // matches than available hits, is walk (m * 1.25)
        // docs to see if we can fill out the top 100 hits.
        // If not, we'll fall back to creating documents
        // for all of them.

        private static ArrayList ScanRecentDocs(IndexReader primary_reader,
                                                IndexReader secondary_reader,
                                                BetterBitArray primary_matches,
                                                Dictionary <int, Hit> hits_by_id,
                                                int max_results,
                                                ref int total_number_of_matches,
                                                HitFilter hit_filter,
                                                string index_name)
        {
            Stopwatch a = new Stopwatch();

            a.Start();

            TermDocs  docs               = primary_reader.TermDocs();
            TermEnum  enumerator         = primary_reader.Terms(new Term("InvertedTimestamp", String.Empty));
            ArrayList results            = new ArrayList(max_results);
            int       docs_found         = 0;
            int       docs_walked        = 0;
            int       hit_filter_removed = 0;
            int       max_docs           = (int)(primary_matches.TrueCount * 1.25);

            Term     term;
            TermDocs secondary_term_docs = null;

            if (secondary_reader != null)
            {
                secondary_term_docs = secondary_reader.TermDocs();
            }

            do
            {
                term = enumerator.Term();

                if (term.Field() != "InvertedTimestamp")
                {
                    break;
                }

                docs.Seek(enumerator);

                while (docs.Next() &&
                       docs_found < max_results &&
                       docs_walked < max_docs)
                {
                    int doc_id = docs.Doc();

                    if (primary_matches.Get(doc_id))
                    {
                        Document doc = primary_reader.Document(doc_id);
                        Hit      hit = CreateHit(doc, secondary_reader, secondary_term_docs);

                        // If we have a HitFilter, apply it.
                        if (hit_filter != null && !hit_filter(hit))
                        {
                            if (Debug)
                            {
                                Log.Debug("Filtered out {0}", hit.Uri);
                            }
                            hit_filter_removed++;
                            continue;
                        }
                        hits_by_id [doc_id] = hit;
                        // Add the result, last modified first
                        results.Add(hit);
                        docs_found++;
                    }

                    docs_walked++;
                }
            } while (enumerator.Next() &&
                     docs_found < max_results &&
                     docs_walked < max_docs);

            docs.Close();
            if (secondary_term_docs != null)
            {
                secondary_term_docs.Close();
            }

            // If we've found all the docs we can return in a subset!
            // Fantastic, we've probably short circuited a slow search.
            if (docs_found != max_results)
            {
                // Otherwise bad luck! Not all docs found
                // Start afresh - this time traversing all results
                results = null;
            }
            else
            {
                // Adjust total_number_of_matches. We need to do this to avoid scenarios like the following:
                // max_hits = 100. Matched 100 results. But hit filter removed 30. So 70 results will be returned.
                // We want to avoid saying "Showing top 70 of 100". Note that since we are not passing
                // every document in the index through the hit_filter, when we say "Showing top 100 of 1234", the
                // 1234 could actually be much less. But since max_hits was 100, that will not mislead the user.
                total_number_of_matches -= hit_filter_removed;
            }

            a.Stop();
            if (Debug)
            {
                Log.Debug(">>> {0}: Walked {1} items, populated an enum with {2} items in {3}", index_name, docs_walked, docs_found, a);

                if (docs_found == max_results)
                {
                    Log.Debug(">>> {0}: Successfully short circuited timestamp ordering!", index_name);
                }
            }

            return(results);
        }