Example #1
 public virtual void TestKnownSetOfDocuments()
 {
     System.String test1 = "eating chocolate in a computer lab"; //6 terms
     System.String test2 = "computer in a computer lab"; //5 terms
     System.String test3 = "a chocolate lab grows old"; //5 terms
     System.String test4 = "eating chocolate with a chocolate lab in an old chocolate colored computer lab"; //13 terms
     System.Collections.IDictionary test4Map = new System.Collections.Hashtable();
     test4Map["chocolate"] = 3;
     test4Map["lab"] = 2;
     test4Map["eating"] = 1;
     test4Map["computer"] = 1;
     test4Map["with"] = 1;
     test4Map["a"] = 1;
     test4Map["colored"] = 1;
     test4Map["in"] = 1;
     test4Map["an"] = 1;
     test4Map["computer"] = 1;
     test4Map["old"] = 1;
     
     Document testDoc1 = new Document();
     SetupDoc(testDoc1, test1);
     Document testDoc2 = new Document();
     SetupDoc(testDoc2, test2);
     Document testDoc3 = new Document();
     SetupDoc(testDoc3, test3);
     Document testDoc4 = new Document();
     SetupDoc(testDoc4, test4);
     
     Directory dir = new MockRAMDirectory();
     
     try
     {
         IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
         Assert.IsTrue(writer != null);
         writer.AddDocument(testDoc1);
         writer.AddDocument(testDoc2);
         writer.AddDocument(testDoc3);
         writer.AddDocument(testDoc4);
         writer.Close();
         IndexSearcher knownSearcher = new IndexSearcher(dir, true);
         TermEnum termEnum = knownSearcher.reader_ForNUnit.Terms();
         TermDocs termDocs = knownSearcher.reader_ForNUnit.TermDocs();
         //System.out.println("Terms: " + termEnum.size() + " Orig Len: " + termArray.length);
         
         Similarity sim = knownSearcher.Similarity;
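          // Walk every term in the index and, for each posting, check that the frequency
          // reported by TermDocs matches the frequency recorded in that document's term vector.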
          while (termEnum.Next())
         {
             Term term = termEnum.Term;
             //System.out.println("Term: " + term);
             termDocs.Seek(term);
             while (termDocs.Next())
             {
                 int docId = termDocs.Doc;
                 int freq = termDocs.Freq;
                 //System.out.println("Doc Id: " + docId + " freq " + freq);
                  ITermFreqVector vector = knownSearcher.reader_ForNUnit.GetTermFreqVector(docId, "field");
                  Assert.IsTrue(vector != null);
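                  // tf, idf and the length norm are computed here only to exercise the
                  // Similarity API; they feed the commented-out debug output below and
                  // are not asserted.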
                 float tf = sim.Tf(freq);
                 float idf = sim.Idf(knownSearcher.DocFreq(term), knownSearcher.MaxDoc);
                 //float qNorm = sim.queryNorm()
                 //This is fine since we don't have stop words
                 float lNorm = sim.LengthNorm("field", vector.GetTerms().Length);
                 //float coord = sim.coord()
                 //System.out.println("TF: " + tf + " IDF: " + idf + " LenNorm: " + lNorm);
                 System.String[] vTerms = vector.GetTerms();
                 int[] freqs = vector.GetTermFrequencies();
                 for (int i = 0; i < vTerms.Length; i++)
                 {
                     if (term.Text.Equals(vTerms[i]))
                     {
                         Assert.IsTrue(freqs[i] == freq);
                     }
                 }
             }
             //System.out.println("--------");
         }
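          // Run a real query and sanity-check both the ranking and the stored term
          // vector of one of the hits.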
         Query query = new TermQuery(new Term("field", "chocolate"));
         ScoreDoc[] hits = knownSearcher.Search(query, null, 1000).ScoreDocs;
          //testDoc3 (doc id 2) should be the first hit because it is the shortest document containing the term
         Assert.IsTrue(hits.Length == 3);
         float score = hits[0].Score;
         /*System.out.println("Hit 0: " + hits.id(0) + " Score: " + hits.score(0) + " String: " + hits.doc(0).toString());
         System.out.println("Explain: " + knownSearcher.explain(query, hits.id(0)));
         System.out.println("Hit 1: " + hits.id(1) + " Score: " + hits.score(1) + " String: " + hits.doc(1).toString());
         System.out.println("Explain: " + knownSearcher.explain(query, hits.id(1)));
         System.out.println("Hit 2: " + hits.id(2) + " Score: " + hits.score(2) + " String: " +  hits.doc(2).toString());
         System.out.println("Explain: " + knownSearcher.explain(query, hits.id(2)));*/
         Assert.IsTrue(hits[0].Doc == 2);
         Assert.IsTrue(hits[1].Doc == 3);
         Assert.IsTrue(hits[2].Doc == 0);
         ITermFreqVector vector2 = knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].Doc, "field");
         Assert.IsTrue(vector2 != null);
         //System.out.println("Vector: " + vector);
         System.String[] terms = vector2.GetTerms();
         int[] freqs2 = vector2.GetTermFrequencies();
         Assert.IsTrue(terms != null && terms.Length == 10);
         for (int i = 0; i < terms.Length; i++)
         {
             System.String term = terms[i];
             //System.out.println("Term: " + term);
             int freq = freqs2[i];
             Assert.IsTrue(test4.IndexOf(term) != - 1);
             System.Int32 freqInt = -1;
             try
             {
                 freqInt = (System.Int32) test4Map[term];
             }
              catch (System.Exception)
             {
                 Assert.IsTrue(false);
             }
             Assert.IsTrue(freqInt == freq);
         }
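          // A SortedTermVectorMapper collapses the vectors of every field of the document
          // into a single set, ordered here by term frequency.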
         SortedTermVectorMapper mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
         knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].Doc, mapper);
         var vectorEntrySet = mapper.TermVectorEntrySet;
         Assert.IsTrue(vectorEntrySet.Count == 10, "mapper.getTermVectorEntrySet() Size: " + vectorEntrySet.Count + " is not: " + 10);
         TermVectorEntry last = null;
         foreach(TermVectorEntry tve in vectorEntrySet)
         {
             if (tve != null && last != null)
             {
                 Assert.IsTrue(last.Frequency >= tve.Frequency, "terms are not properly sorted");
                 System.Int32 expectedFreq = (System.Int32) test4Map[tve.Term];
                 //we expect double the expectedFreq, since there are two fields with the exact same text and we are collapsing all fields
                 Assert.IsTrue(tve.Frequency == 2 * expectedFreq, "Frequency is not correct:");
             }
             last = tve;
         }
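
          // A FieldSortedTermVectorMapper keeps a separate sorted entry set per field,
          // so the map is expected to hold one entry for each of the two indexed fields.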
         
         FieldSortedTermVectorMapper fieldMapper = new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
         knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].Doc, fieldMapper);
         var map = fieldMapper.FieldToTerms;
         Assert.IsTrue(map.Count == 2, "map Size: " + map.Count + " is not: " + 2);
         vectorEntrySet = map["field"];
         Assert.IsTrue(vectorEntrySet != null, "vectorEntrySet is null and it shouldn't be");
         Assert.IsTrue(vectorEntrySet.Count == 10, "vectorEntrySet Size: " + vectorEntrySet.Count + " is not: " + 10);
         knownSearcher.Close();
     }
     catch (System.IO.IOException e)
     {
         System.Console.Error.WriteLine(e.StackTrace);
         Assert.IsTrue(false);
     }
 }

Example #2
		public virtual void TestMapper()
		{
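			// dir, seg, fieldInfos and testFields are assumed to be fixture members
			// populated by this test class's SetUp method (not shown here).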
			TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
			Assert.IsTrue(reader != null);
			SortedTermVectorMapper mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
			reader.Get(0, mapper);
			SortedSet<TermVectorEntry> set_Renamed = mapper.GetTermVectorEntrySet();
			Assert.IsTrue(set_Renamed != null, "set is null and it shouldn't be");
			//three fields, 4 terms, all terms are the same
			Assert.IsTrue(set_Renamed.Count == 4, "set Size: " + set_Renamed.Count + " is not: " + 4);
			//Check offsets and positions
            foreach(TermVectorEntry tve in set_Renamed)
            {			
				Assert.IsTrue(tve != null, "tve is null and it shouldn't be");
				Assert.IsTrue(tve.GetOffsets() != null, "tve.getOffsets() is null and it shouldn't be");
				Assert.IsTrue(tve.GetPositions() != null, "tve.getPositions() is null and it shouldn't be");
			}
			
			mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
			reader.Get(1, mapper);
			set_Renamed = mapper.GetTermVectorEntrySet();
			Assert.IsTrue(set_Renamed != null, "set is null and it shouldn't be");
			//three fields, 4 terms, all terms are the same
			Assert.IsTrue(set_Renamed.Count == 4, "set Size: " + set_Renamed.Count + " is not: " + 4);
			//Should have offsets and positions b/c we are munging all the fields together
            foreach(TermVectorEntry tve in set_Renamed)
			{
				Assert.IsTrue(tve != null, "tve is null and it shouldn't be");
				Assert.IsTrue(tve.GetOffsets() != null, "tve.getOffsets() is null and it shouldn't be");
				Assert.IsTrue(tve.GetPositions() != null, "tve.getPositions() is null and it shouldn't be");
			}
			
			
			FieldSortedTermVectorMapper fsMapper = new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
			reader.Get(0, fsMapper);
            IDictionary<string, SortedSet<TermVectorEntry>> map = fsMapper.GetFieldToTerms();
			Assert.IsTrue(map.Count == testFields.Length, "map Size: " + map.Count + " is not: " + testFields.Length);
            foreach(KeyValuePair<string,SortedSet<TermVectorEntry>> entry in new Dictionary<string, SortedSet<TermVectorEntry>>(map))
			{
				SortedSet<TermVectorEntry> sortedSet = entry.Value;
				Assert.IsTrue(sortedSet.Count == 4, "sortedSet Size: " + sortedSet.Count + " is not: " + 4);
                foreach(TermVectorEntry tve in sortedSet)
				{
					Assert.IsTrue(tve != null, "tve is null and it shouldn't be");
					//Check offsets and positions.
					System.String field = tve.GetField();
					if (field.Equals(testFields[0]))
					{
						//should have offsets
						
						Assert.IsTrue(tve.GetOffsets() != null, "tve.getOffsets() is null and it shouldn't be");
						Assert.IsTrue(tve.GetPositions() != null, "tve.getPositions() is null and it shouldn't be");
					}
					else if (field.Equals(testFields[1]))
					{
						//should not have offsets
						
						Assert.IsTrue(tve.GetOffsets() == null, "tve.getOffsets() is not null and it shouldn't be");
						Assert.IsTrue(tve.GetPositions() == null, "tve.getPositions() is not null and it shouldn't be");
					}
				}
			}
			//Try mapper that ignores offs and positions
			fsMapper = new FieldSortedTermVectorMapper(true, true, new TermVectorEntryFreqSortedComparator());
			reader.Get(0, fsMapper);
			map = fsMapper.GetFieldToTerms();
			Assert.IsTrue(map.Count == testFields.Length, "map Size: " + map.Count + " is not: " + testFields.Length);
            foreach(KeyValuePair<string,SortedSet<TermVectorEntry>> entry in new Dictionary<string,SortedSet<TermVectorEntry>>(map))
			{
				SortedSet<TermVectorEntry> sortedSet = entry.Value;
				Assert.IsTrue(sortedSet.Count == 4, "sortedSet Size: " + sortedSet.Count + " is not: " + 4);
                foreach(TermVectorEntry tve in sortedSet)
				{
					Assert.IsTrue(tve != null, "tve is null and it shouldn't be");
					//Check offsets and positions.
					System.String field = tve.GetField();
					if (field.Equals(testFields[0]))
					{
						//offsets and positions should be ignored by this mapper
						
						Assert.IsTrue(tve.GetOffsets() == null, "tve.getOffsets() is not null and it should be");
						Assert.IsTrue(tve.GetPositions() == null, "tve.getPositions() is not null and it should be");
					}
					else if (field.Equals(testFields[1]))
					{
						//should not have offsets
						
						Assert.IsTrue(tve.GetOffsets() == null, "tve.getOffsets() is not null and it shouldn't be");
						Assert.IsTrue(tve.GetPositions() == null, "tve.getPositions() is not null and it shouldn't be");
					}
				}
			}
			
			// test setDocumentNumber()
			IndexReader ir = IndexReader.Open(dir);
			DocNumAwareMapper docNumAwareMapper = new DocNumAwareMapper();
			Assert.AreEqual(- 1, docNumAwareMapper.GetDocumentNumber());
			
			ir.GetTermFreqVector(0, docNumAwareMapper);
			Assert.AreEqual(0, docNumAwareMapper.GetDocumentNumber());
			docNumAwareMapper.SetDocumentNumber(- 1);
			
			ir.GetTermFreqVector(1, docNumAwareMapper);
			Assert.AreEqual(1, docNumAwareMapper.GetDocumentNumber());
			docNumAwareMapper.SetDocumentNumber(- 1);
			
			ir.GetTermFreqVector(0, "f1", docNumAwareMapper);
			Assert.AreEqual(0, docNumAwareMapper.GetDocumentNumber());
			docNumAwareMapper.SetDocumentNumber(- 1);
			
			ir.GetTermFreqVector(1, "f2", docNumAwareMapper);
			Assert.AreEqual(1, docNumAwareMapper.GetDocumentNumber());
			docNumAwareMapper.SetDocumentNumber(- 1);
			
			ir.GetTermFreqVector(0, "f1", docNumAwareMapper);
			Assert.AreEqual(0, docNumAwareMapper.GetDocumentNumber());
			
			ir.Close();
		}

Example #3
        public virtual void TestMapper()
        {
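            // As above, dir, seg, fieldInfos and testFields are assumed to be fixture
            // members initialized in the test's SetUp method (not shown here).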
            TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);

            Assert.IsTrue(reader != null);
            SortedTermVectorMapper mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());

            reader.Get(0, mapper);
            var set_Renamed = mapper.TermVectorEntrySet;

            Assert.IsTrue(set_Renamed != null, "set is null and it shouldn't be");
            //three fields, 4 terms, all terms are the same
            Assert.IsTrue(set_Renamed.Count == 4, "set Size: " + set_Renamed.Count + " is not: " + 4);
            //Check offsets and positions
            for (System.Collections.IEnumerator iterator = set_Renamed.GetEnumerator(); iterator.MoveNext();)
            {
                TermVectorEntry tve = (TermVectorEntry)iterator.Current;
                Assert.IsTrue(tve != null, "tve is null and it shouldn't be");
                Assert.IsTrue(tve.GetOffsets() != null, "tve.getOffsets() is null and it shouldn't be");
                Assert.IsTrue(tve.GetPositions() != null, "tve.getPositions() is null and it shouldn't be");
            }

            mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
            reader.Get(1, mapper);
            set_Renamed = mapper.TermVectorEntrySet;
            Assert.IsTrue(set_Renamed != null, "set is null and it shouldn't be");
            //three fields, 4 terms, all terms are the same
            Assert.IsTrue(set_Renamed.Count == 4, "set Size: " + set_Renamed.Count + " is not: " + 4);
            //Should have offsets and positions b/c we are munging all the fields together
            for (System.Collections.IEnumerator iterator = set_Renamed.GetEnumerator(); iterator.MoveNext();)
            {
                TermVectorEntry tve = (TermVectorEntry)iterator.Current;
                Assert.IsTrue(tve != null, "tve is null and it shouldn't be");
                Assert.IsTrue(tve.GetOffsets() != null, "tve.getOffsets() is null and it shouldn't be");
                Assert.IsTrue(tve.GetPositions() != null, "tve.getPositions() is null and it shouldn't be");
            }


            FieldSortedTermVectorMapper fsMapper = new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());

            reader.Get(0, fsMapper);
            var map = fsMapper.FieldToTerms;

            Assert.IsTrue(map.Count == testFields.Length, "map Size: " + map.Count + " is not: " + testFields.Length);
            for (var iterator = map.GetEnumerator(); iterator.MoveNext();)
            {
                var entry     = iterator.Current;
                var sortedSet = entry.Value;
                Assert.IsTrue(sortedSet.Count == 4, "sortedSet Size: " + sortedSet.Count + " is not: " + 4);
                for (var inner = sortedSet.GetEnumerator(); inner.MoveNext();)
                {
                    TermVectorEntry tve = inner.Current;
                    Assert.IsTrue(tve != null, "tve is null and it shouldn't be");
                    //Check offsets and positions.
                    System.String field = tve.Field;
                    if (field.Equals(testFields[0]))
                    {
                        //should have offsets

                        Assert.IsTrue(tve.GetOffsets() != null, "tve.getOffsets() is null and it shouldn't be");
                        Assert.IsTrue(tve.GetPositions() != null, "tve.getPositions() is null and it shouldn't be");
                    }
                    else if (field.Equals(testFields[1]))
                    {
                        //should not have offsets

                        Assert.IsTrue(tve.GetOffsets() == null, "tve.getOffsets() is not null and it shouldn't be");
                        Assert.IsTrue(tve.GetPositions() == null, "tve.getPositions() is not null and it shouldn't be");
                    }
                }
            }
            //Try mapper that ignores offs and positions
            fsMapper = new FieldSortedTermVectorMapper(true, true, new TermVectorEntryFreqSortedComparator());
            reader.Get(0, fsMapper);
            map = fsMapper.FieldToTerms;
            Assert.IsTrue(map.Count == testFields.Length, "map Size: " + map.Count + " is not: " + testFields.Length);
            for (var iterator = map.GetEnumerator(); iterator.MoveNext();)
            {
                var entry     = iterator.Current;
                var sortedSet = entry.Value;
                Assert.IsTrue(sortedSet.Count == 4, "sortedSet Size: " + sortedSet.Count + " is not: " + 4);
                for (var inner = sortedSet.GetEnumerator(); inner.MoveNext();)
                {
                    TermVectorEntry tve = inner.Current;
                    Assert.IsTrue(tve != null, "tve is null and it shouldn't be");
                    //Check offsets and positions.
                    System.String field = tve.Field;
                    if (field.Equals(testFields[0]))
                    {
                        //offsets and positions should be ignored by this mapper

                        Assert.IsTrue(tve.GetOffsets() == null, "tve.getOffsets() is not null and it should be");
                        Assert.IsTrue(tve.GetPositions() == null, "tve.getPositions() is not null and it should be");
                    }
                    else if (field.Equals(testFields[1]))
                    {
                        //should not have offsets

                        Assert.IsTrue(tve.GetOffsets() == null, "tve.getOffsets() is not null and it shouldn't be");
                        Assert.IsTrue(tve.GetPositions() == null, "tve.getPositions() is not null and it shouldn't be");
                    }
                }
            }

            // test setDocumentNumber()
            IndexReader       ir = IndexReader.Open(dir, true);
            DocNumAwareMapper docNumAwareMapper = new DocNumAwareMapper();

            Assert.AreEqual(-1, docNumAwareMapper.GetDocumentNumber());

            ir.GetTermFreqVector(0, docNumAwareMapper);
            Assert.AreEqual(0, docNumAwareMapper.GetDocumentNumber());
            docNumAwareMapper.SetDocumentNumber(-1);

            ir.GetTermFreqVector(1, docNumAwareMapper);
            Assert.AreEqual(1, docNumAwareMapper.GetDocumentNumber());
            docNumAwareMapper.SetDocumentNumber(-1);

            ir.GetTermFreqVector(0, "f1", docNumAwareMapper);
            Assert.AreEqual(0, docNumAwareMapper.GetDocumentNumber());
            docNumAwareMapper.SetDocumentNumber(-1);

            ir.GetTermFreqVector(1, "f2", docNumAwareMapper);
            Assert.AreEqual(1, docNumAwareMapper.GetDocumentNumber());
            docNumAwareMapper.SetDocumentNumber(-1);

            ir.GetTermFreqVector(0, "f1", docNumAwareMapper);
            Assert.AreEqual(0, docNumAwareMapper.GetDocumentNumber());

            ir.Close();
        }