DocFreq() Public Method

Returns the number of documents containing the given term.

public DocFreq ( Lucene.Net.Index.Term term ) : int
term Lucene.Net.Index.Term The term whose document frequency is requested.
Returns int The number of documents that contain term.
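
A minimal, self-contained usage sketch (not one of the scraped examples below): it builds a one-document in-memory index and then asks for the document frequency of a term. The RAMDirectory, the sample text, and the field name "field" are illustrative assumptions chosen to mirror the examples that follow.

        using Lucene.Net.Analysis;
        using Lucene.Net.Documents;
        using Lucene.Net.Index;
        using Lucene.Net.Search;
        using Lucene.Net.Store;

        // Build a small in-memory index containing a single document, then ask
        // how many documents contain the term field:chocolate.
        Directory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);

        Document doc = new Document();
        doc.Add(new Field("field", "eating chocolate in a computer lab", Field.Store.YES, Field.Index.ANALYZED));
        writer.AddDocument(doc);
        writer.Close();

        IndexSearcher searcher = new IndexSearcher(dir, true); // read-only searcher
        try
        {
            int docFreq = searcher.DocFreq(new Term("field", "chocolate"));
            System.Console.WriteLine("Documents containing the term: " + docFreq); // prints 1
        }
        finally
        {
            searcher.Close();
        }
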
Example #1
        /// <summary> Check whether the word exists in the index.</summary>
        /// <param name="word">the word to look up in the index
        /// </param>
        /// <throws>  IOException </throws>
        /// <returns> true iff the word exists in the index
        /// </returns>
        public virtual bool Exist(System.String word)
        {
            // obtainSearcher calls ensureOpen
            IndexSearcher indexSearcher = ObtainSearcher();

            try
            {
                return indexSearcher.DocFreq(F_WORD_TERM.CreateTerm(word)) > 0;
            }
            finally
            {
                ReleaseSearcher(indexSearcher);
            }
        }
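
Judging from the helper names (F_WORD_TERM, ObtainSearcher, ReleaseSearcher), this method appears to come from the contrib SpellChecker class: DocFreq(term) > 0 acts as a cheap existence test, because a word is present in the dictionary index exactly when at least one document contains it. A hedged usage sketch, assuming a spellChecker instance whose dictionary index has already been built:

        // spellChecker is assumed to be an already-initialized contrib SpellChecker
        bool known = spellChecker.Exist("chocolate");
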
Example #2
 public virtual void  TestKnownSetOfDocuments()
 {
     System.String test1 = "eating chocolate in a computer lab"; //6 terms
     System.String test2 = "computer in a computer lab"; //5 terms
     System.String test3 = "a chocolate lab grows old"; //5 terms
     System.String test4 = "eating chocolate with a chocolate lab in an old chocolate colored computer lab"; //13 terms
     System.Collections.IDictionary test4Map = new System.Collections.Hashtable();
     test4Map["chocolate"] = 3;
     test4Map["lab"] = 2;
     test4Map["eating"] = 1;
     test4Map["computer"] = 1;
     test4Map["with"] = 1;
     test4Map["a"] = 1;
     test4Map["colored"] = 1;
     test4Map["in"] = 1;
     test4Map["an"] = 1;
     test4Map["computer"] = 1;
     test4Map["old"] = 1;
     
     Document testDoc1 = new Document();
     SetupDoc(testDoc1, test1);
     Document testDoc2 = new Document();
     SetupDoc(testDoc2, test2);
     Document testDoc3 = new Document();
     SetupDoc(testDoc3, test3);
     Document testDoc4 = new Document();
     SetupDoc(testDoc4, test4);
     
     Directory dir = new MockRAMDirectory();
     
     try
     {
         IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
         Assert.IsTrue(writer != null);
         writer.AddDocument(testDoc1);
         writer.AddDocument(testDoc2);
         writer.AddDocument(testDoc3);
         writer.AddDocument(testDoc4);
         writer.Close();
         IndexSearcher knownSearcher = new IndexSearcher(dir, true);
         TermEnum termEnum = knownSearcher.reader_ForNUnit.Terms();
         TermDocs termDocs = knownSearcher.reader_ForNUnit.TermDocs();
         //System.out.println("Terms: " + termEnum.size() + " Orig Len: " + termArray.length);
         
         Similarity sim = knownSearcher.Similarity;
          while (termEnum.Next())
         {
             Term term = termEnum.Term;
             //System.out.println("Term: " + term);
             termDocs.Seek(term);
             while (termDocs.Next())
             {
                 int docId = termDocs.Doc;
                 int freq = termDocs.Freq;
                 //System.out.println("Doc Id: " + docId + " freq " + freq);
                 ITermFreqVector vector = knownSearcher.reader_ForNUnit.GetTermFreqVector(docId, "field");
                 float tf = sim.Tf(freq);
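                 // DocFreq(term) is the number of documents containing the term; with the
                 // default DefaultSimilarity it becomes idf = log(maxDoc / (docFreq + 1)) + 1.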
                 float idf = sim.Idf(knownSearcher.DocFreq(term), knownSearcher.MaxDoc);
                 //float qNorm = sim.queryNorm()
                 //This is fine since we don't have stop words
                 float lNorm = sim.LengthNorm("field", vector.GetTerms().Length);
                 //float coord = sim.coord()
                 //System.out.println("TF: " + tf + " IDF: " + idf + " LenNorm: " + lNorm);
                 Assert.IsTrue(vector != null);
                 System.String[] vTerms = vector.GetTerms();
                 int[] freqs = vector.GetTermFrequencies();
                 for (int i = 0; i < vTerms.Length; i++)
                 {
                     if (term.Text.Equals(vTerms[i]))
                     {
                         Assert.IsTrue(freqs[i] == freq);
                     }
                 }
             }
             //System.out.println("--------");
         }
         Query query = new TermQuery(new Term("field", "chocolate"));
         ScoreDoc[] hits = knownSearcher.Search(query, null, 1000).ScoreDocs;
         //doc 3 should be the first hit b/c it is the shortest match
         Assert.IsTrue(hits.Length == 3);
         float score = hits[0].Score;
         /*System.out.println("Hit 0: " + hits.id(0) + " Score: " + hits.score(0) + " String: " + hits.doc(0).toString());
         System.out.println("Explain: " + knownSearcher.explain(query, hits.id(0)));
         System.out.println("Hit 1: " + hits.id(1) + " Score: " + hits.score(1) + " String: " + hits.doc(1).toString());
         System.out.println("Explain: " + knownSearcher.explain(query, hits.id(1)));
         System.out.println("Hit 2: " + hits.id(2) + " Score: " + hits.score(2) + " String: " +  hits.doc(2).toString());
         System.out.println("Explain: " + knownSearcher.explain(query, hits.id(2)));*/
         Assert.IsTrue(hits[0].Doc == 2);
         Assert.IsTrue(hits[1].Doc == 3);
         Assert.IsTrue(hits[2].Doc == 0);
         ITermFreqVector vector2 = knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].Doc, "field");
         Assert.IsTrue(vector2 != null);
         //System.out.println("Vector: " + vector);
         System.String[] terms = vector2.GetTerms();
         int[] freqs2 = vector2.GetTermFrequencies();
         Assert.IsTrue(terms != null && terms.Length == 10);
         for (int i = 0; i < terms.Length; i++)
         {
             System.String term = terms[i];
             //System.out.println("Term: " + term);
             int freq = freqs2[i];
              Assert.IsTrue(test4.IndexOf(term) != -1);
             System.Int32 freqInt = -1;
             try
             {
                 freqInt = (System.Int32) test4Map[term];
             }
             catch (Exception)
             {
                 Assert.IsTrue(false);
             }
             Assert.IsTrue(freqInt == freq);
         }
         SortedTermVectorMapper mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
         knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].Doc, mapper);
         var vectorEntrySet = mapper.TermVectorEntrySet;
         Assert.IsTrue(vectorEntrySet.Count == 10, "mapper.getTermVectorEntrySet() Size: " + vectorEntrySet.Count + " is not: " + 10);
         TermVectorEntry last = null;
         foreach(TermVectorEntry tve in vectorEntrySet)
         {
             if (tve != null && last != null)
             {
                 Assert.IsTrue(last.Frequency >= tve.Frequency, "terms are not properly sorted");
                 System.Int32 expectedFreq = (System.Int32) test4Map[tve.Term];
                 //we expect double the expectedFreq, since there are two fields with the exact same text and we are collapsing all fields
                 Assert.IsTrue(tve.Frequency == 2 * expectedFreq, "Frequency is not correct:");
             }
             last = tve;
         }
         
         FieldSortedTermVectorMapper fieldMapper = new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
         knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].Doc, fieldMapper);
         var map = fieldMapper.FieldToTerms;
         Assert.IsTrue(map.Count == 2, "map Size: " + map.Count + " is not: " + 2);
         vectorEntrySet = map["field"];
         Assert.IsTrue(vectorEntrySet != null, "vectorEntrySet is null and it shouldn't be");
         Assert.IsTrue(vectorEntrySet.Count == 10, "vectorEntrySet Size: " + vectorEntrySet.Count + " is not: " + 10);
         knownSearcher.Close();
     }
     catch (System.IO.IOException e)
     {
         System.Console.Error.WriteLine(e.StackTrace);
         Assert.IsTrue(false);
     }
 }