/// <summary>
        /// Accumulates the per-term frequencies of <paramref name="vector"/> into
        /// <paramref name="termFreqMap"/>, skipping every term that IsNoiseWord rejects.
        /// </summary>
        /// <param name="termFreqMap">Dictionary keyed by term text whose values are Int counters;
        /// a counter is created on first sight of a term and incremented afterwards.
        /// </param>
        /// <param name="vector">Terms and their frequencies for a single doc/field.
        /// </param>
        private void AddTermFrequencies(IDictionary termFreqMap, TermFreqVector vector)
        {
            String[] vectorTerms = vector.GetTerms();
            int[] vectorFreqs = vector.GetTermFrequencies();

            for (int index = 0; index < vectorTerms.Length; index++)
            {
                String word = vectorTerms[index];
                if (!IsNoiseWord(word))
                {
                    Int counter = (Int)termFreqMap[word];
                    if (counter != null)
                    {
                        // Term already seen: add this vector's frequency to the running total.
                        counter.x += vectorFreqs[index];
                    }
                    else
                    {
                        // First occurrence: register a counter, then seed it with the frequency.
                        counter = new Int();
                        termFreqMap[word] = counter;
                        counter.x = vectorFreqs[index];
                    }
                }
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Checks the term vector of DocHelper.TEXT_FIELD_2_KEY for document 0: three terms,
        /// each contained in DocHelper.FIELD_2_TEXT with a positive frequency, and two term
        /// vectors in total for the document.
        /// </summary>
        public virtual void  TestTermVectors()
        {
            try
            {
                TermFreqVector fieldVector = reader.GetTermFreqVector(0, DocHelper.TEXT_FIELD_2_KEY);
                Assert.IsTrue(fieldVector != null);

                System.String[] vectorTerms = fieldVector.GetTerms();
                int[] vectorFreqs = fieldVector.GetTermFrequencies();
                Assert.IsTrue(vectorTerms != null && vectorTerms.Length == 3 && vectorFreqs != null && vectorFreqs.Length == 3);

                for (int pos = 0; pos < vectorTerms.Length; pos++)
                {
                    System.String text = vectorTerms[pos];
                    int count = vectorFreqs[pos];
                    Assert.IsTrue(DocHelper.FIELD_2_TEXT.IndexOf(text) != -1);
                    Assert.IsTrue(count > 0);
                }

                // All vectors stored for the document, regardless of field.
                TermFreqVector[] allVectors = reader.GetTermFreqVectors(0);
                Assert.IsTrue(allVectors != null);
                Assert.IsTrue(allVectors.Length == 2);
            }
            catch (System.IO.IOException ioError)
            {
                // An I/O failure is reported on stderr and turned into a test failure.
                System.Console.Error.WriteLine(ioError.StackTrace);
                Assert.IsTrue(false);
            }
        }
Exemplo n.º 3
0
        /// <summary>
        /// Builds the term-vector form: shows <paramref name="fieldName"/> in the field label and
        /// fills the list view with one (frequency, term) row per entry of <paramref name="tfv"/>.
        /// </summary>
        /// <param name="fieldName">Text displayed in lblField.</param>
        /// <param name="tfv">Source of the terms and frequencies to list.</param>
        public TermVector(string fieldName, TermFreqVector tfv)
        {
            // Required for Windows Form Designer support: controls must exist before use.
            InitializeComponent();

            lblField.Text = fieldName;

            IntPair[] pairs = new IntPair[tfv.Size()];
            String[] termTexts = tfv.GetTerms();
            int[] termCounts = tfv.GetTermFrequencies();
            for (int src = 0; src < termTexts.Length; src++)
            {
                pairs[src] = new IntPair(termCounts[src], termTexts[src]);
            }

            // Ordering is delegated to IntPair.PairComparator(false, true).
            // NOTE(review): the meaning of the two flags is not visible here - confirm in IntPair.
            Array.Sort(pairs, new IntPair.PairComparator(false, true));

            // Suspend repainting while the rows are added in bulk.
            listViewTVF.BeginUpdate();

            foreach (IntPair pair in pairs)
            {
                string[] columns = new string[] { pair.cnt.ToString(), pair.text };
                listViewTVF.Items.Add(new ListViewItem(columns));
            }

            listViewTVF.EndUpdate();
        }
Exemplo n.º 4
0
        /// <summary>
        /// Builds the term-vector form: shows <paramref name="fieldName"/> in the field label and
        /// fills the list view with one (frequency, term) row per entry of <paramref name="tfv"/>,
        /// ordered by term.
        /// </summary>
        /// <param name="fieldName">Text displayed in lblField.</param>
        /// <param name="tfv">Source of the terms and frequencies to list.</param>
        public TermVector(string fieldName, TermFreqVector tfv)
        {
            //
            // Required for Windows Form Designer support
            //
            InitializeComponent();

            lblField.Text = fieldName;

            List<TermFrequency> tvs = new List<TermFrequency>(tfv.Size());

            String[] terms = tfv.GetTerms();
            int[] freqs = tfv.GetTermFrequencies();
            for (int i = 0; i < terms.Length; i++)
            {
                tvs.Add(new TermFrequency(terms[i], freqs[i]));
            }

            listViewTVF.BeginUpdate();

            // OrderBy does not sort the list in place; it returns a new ordered sequence.
            // The previous code discarded that result, so rows were never sorted.
            // Enumerate the ordered sequence directly instead.
            foreach (TermFrequency tf in tvs.OrderBy(p => p.Term))
            {
                ListViewItem item = new ListViewItem(new string[] { tf.Frequency.ToString(), tf.Term });
                listViewTVF.Items.Add(item);
            }

            listViewTVF.EndUpdate();
        }
Exemplo n.º 5
0
        /// <summary>
        /// Builds the term-vector form: shows <paramref name="fieldName"/> in the field label and
        /// fills the list view with one (frequency, term) row per entry of <paramref name="tfv"/>,
        /// ordered by term.
        /// </summary>
        /// <param name="fieldName">Text displayed in lblField.</param>
        /// <param name="tfv">Source of the terms and frequencies to list.</param>
        public TermVector(string fieldName, TermFreqVector tfv)
        {
            //
            // Required for Windows Form Designer support
            //
            InitializeComponent();

            lblField.Text = fieldName;

            List <TermFrequency> tvs = new List <TermFrequency>(tfv.Size());

            String[] terms = tfv.GetTerms();
            int[]    freqs = tfv.GetTermFrequencies();
            for (int i = 0; i < terms.Length; i++)
            {
                tvs.Add(new TermFrequency(terms[i], freqs[i]));
            }

            listViewTVF.BeginUpdate();

            // OrderBy does not sort the list in place; it returns a new ordered sequence.
            // The previous code discarded that result, so rows were never sorted.
            // Enumerate the ordered sequence directly instead.
            foreach (TermFrequency tf in tvs.OrderBy(p => p.Term))
            {
                ListViewItem item = new ListViewItem(new string[] { tf.Frequency.ToString(), tf.Term });
                listViewTVF.Items.Add(item);
            }

            listViewTVF.EndUpdate();
        }
Exemplo n.º 6
0
        /// <summary>
        /// Merges reader1 and reader2 into a new segment and verifies the result: document count,
        /// stored field counts, a posting for "Field" in TEXT_FIELD_2_KEY, the indexed field names
        /// returned by GetIndexedFieldNames(true), and the term vector of document 0.
        /// </summary>
        public virtual void  TestMerge()
        {
            SegmentMerger segmentMerger = new SegmentMerger(mergedDir, mergedSegment, false);
            segmentMerger.Add(reader1);
            segmentMerger.Add(reader2);

            try
            {
                int mergedCount = segmentMerger.Merge();
                segmentMerger.CloseReaders();
                Assert.IsTrue(mergedCount == 2);

                // Should be able to open a new SegmentReader against the new directory.
                SegmentInfo mergedInfo = new SegmentInfo(mergedSegment, mergedCount, mergedDir);
                SegmentReader mergedReader = new SegmentReader(mergedInfo);
                Assert.IsTrue(mergedReader != null);
                Assert.IsTrue(mergedReader.NumDocs() == 2);

                // There are 2 unstored fields on each document, so they do not come back.
                Document firstDoc = mergedReader.Document(0);
                Assert.IsTrue(firstDoc != null);
                Assert.IsTrue(DocHelper.NumFields(firstDoc) == DocHelper.NumFields(doc1) - 2);

                Document secondDoc = mergedReader.Document(1);
                Assert.IsTrue(secondDoc != null);
                Assert.IsTrue(DocHelper.NumFields(secondDoc) == DocHelper.NumFields(doc2) - 2);

                // At least one posting must exist for the probe term.
                Term probe = new Term(DocHelper.TEXT_FIELD_2_KEY, "Field");
                TermDocs postings = mergedReader.TermDocs(probe);
                Assert.IsTrue(postings != null);
                Assert.IsTrue(postings.Next());

                System.Collections.ICollection indexedNames = mergedReader.GetIndexedFieldNames(true);
                Assert.IsTrue(indexedNames != null);
                Assert.IsTrue(indexedNames.Count == 2);

                TermFreqVector fieldVector = mergedReader.GetTermFreqVector(0, DocHelper.TEXT_FIELD_2_KEY);
                Assert.IsTrue(fieldVector != null);

                System.String[] vectorTerms = fieldVector.GetTerms();
                Assert.IsTrue(vectorTerms != null);
                Assert.IsTrue(vectorTerms.Length == 3);

                int[] vectorFreqs = fieldVector.GetTermFrequencies();
                Assert.IsTrue(vectorFreqs != null);

                // Every term must occur in the source text with the frequency DocHelper expects.
                for (int pos = 0; pos < vectorTerms.Length; pos++)
                {
                    System.String text = vectorTerms[pos];
                    int count = vectorFreqs[pos];
                    Assert.IsTrue(DocHelper.FIELD_2_TEXT.IndexOf(text) != -1);
                    Assert.IsTrue(DocHelper.FIELD_2_FREQS[pos] == count);
                }
            }
            catch (System.IO.IOException ioError)
            {
                // An I/O failure is reported on stderr and turned into a test failure.
                System.Console.Error.WriteLine(ioError.StackTrace);
                Assert.IsTrue(false);
            }
        }
Exemplo n.º 7
0
        /// <summary>
        /// Merges reader1 and reader2 into a new segment and verifies the result: document count,
        /// stored field counts, a posting for "field" in TEXT_FIELD_2_KEY, the fields indexed with
        /// term vectors, the term vector of document 0, and finally the norms via
        /// TestSegmentReader.CheckNorms.
        /// </summary>
        public virtual void  TestMerge()
        {
            SegmentMerger segmentMerger = new SegmentMerger(mergedDir, mergedSegment);
            segmentMerger.Add(reader1);
            segmentMerger.Add(reader2);

            int mergedCount = segmentMerger.Merge();
            segmentMerger.CloseReaders();
            Assert.IsTrue(mergedCount == 2);

            // Should be able to open a new SegmentReader against the new directory.
            SegmentInfo mergedInfo = new SegmentInfo(mergedSegment, mergedCount, mergedDir, false, true);
            SegmentReader mergedReader = SegmentReader.Get(mergedInfo);
            Assert.IsTrue(mergedReader != null);
            Assert.IsTrue(mergedReader.NumDocs() == 2);

            // Unstored fields do not come back, so subtract them from the expected count.
            Document firstDoc = mergedReader.Document(0);
            Assert.IsTrue(firstDoc != null);
            Assert.IsTrue(DocHelper.NumFields(firstDoc) == DocHelper.NumFields(doc1) - DocHelper.unstored.Count);

            Document secondDoc = mergedReader.Document(1);
            Assert.IsTrue(secondDoc != null);
            Assert.IsTrue(DocHelper.NumFields(secondDoc) == DocHelper.NumFields(doc2) - DocHelper.unstored.Count);

            // At least one posting must exist for the probe term.
            Term probe = new Term(DocHelper.TEXT_FIELD_2_KEY, "field");
            TermDocs postings = mergedReader.TermDocs(probe);
            Assert.IsTrue(postings != null);
            Assert.IsTrue(postings.Next());

            System.Collections.Generic.ICollection <string> vectorFields = mergedReader.GetFieldNames(IndexReader.FieldOption.INDEXED_WITH_TERMVECTOR);
            Assert.IsTrue(vectorFields != null);
            Assert.IsTrue(vectorFields.Count == 4, "We do not have 4 fields that were indexed with term vector");

            TermFreqVector fieldVector = mergedReader.GetTermFreqVector(0, DocHelper.TEXT_FIELD_2_KEY);
            Assert.IsTrue(fieldVector != null);

            System.String[] vectorTerms = fieldVector.GetTerms();
            Assert.IsTrue(vectorTerms != null);
            Assert.IsTrue(vectorTerms.Length == 3);

            int[] vectorFreqs = fieldVector.GetTermFrequencies();
            Assert.IsTrue(vectorFreqs != null);
            Assert.IsTrue(fieldVector is TermPositionVector);

            // Every term must occur in the source text with the frequency DocHelper expects.
            for (int pos = 0; pos < vectorTerms.Length; pos++)
            {
                System.String text = vectorTerms[pos];
                int count = vectorFreqs[pos];
                Assert.IsTrue(DocHelper.FIELD_2_TEXT.IndexOf(text) != -1);
                Assert.IsTrue(DocHelper.FIELD_2_FREQS[pos] == count);
            }

            TestSegmentReader.CheckNorms(mergedReader);
        }
 /// <summary>
 /// Writes one term vector: opens the vector's field, adds every (term, frequency) pair
 /// through AddTermInternal, then closes the field.
 /// </summary>
 /// <param name="vector">Vector whose field, terms and frequencies are written.</param>
 private void  AddTermFreqVectorInternal(TermFreqVector vector)
 {
     OpenField(vector.GetField());

     // Read the arrays and the size once instead of re-invoking the getters on every iteration.
     int size = vector.Size();
     string[] terms = vector.GetTerms();
     int[] freqs = vector.GetTermFrequencies();
     for (int i = 0; i < size; i++)
     {
         AddTermInternal(terms[i], freqs[i]);
     }

     CloseField();
 }
Exemplo n.º 9
0
        /// <summary> Add a complete document specified by all its term vectors. The document is
        /// opened and closed even when <paramref name="vectors"/> is null, so a document without
        /// term vectors still gets its entry written (the original notes: "add value for tvx").
        /// </summary>
        /// <param name="vectors">Term vectors of the document, one per field; may be null.
        /// Entries that also implement TermPositionVector have their positions and offsets
        /// written; all other entries are written with terms and frequencies only.
        /// </param>
        /// <throws>  IOException </throws>
        public void  AddAllDocVectors(TermFreqVector[] vectors)
        {
            OpenDocument();

            if (vectors != null)
            {
                for (int i = 0; i < vectors.Length; i++)
                {
                    TermFreqVector tfVector = vectors[i];

                    // Test the runtime type directly rather than casting and catching
                    // InvalidCastException. The catch also swallowed unrelated cast failures
                    // from nested calls and could re-open the field with stale flags.
                    TermPositionVector tpVector = tfVector as TermPositionVector;

                    if (tpVector != null)
                    {
                        int size = tpVector.Size();

                        // Positions/offsets are stored for the field only if the first term has them.
                        bool storePositionWithTermVector = size > 0 && tpVector.GetTermPositions(0) != null;
                        bool storeOffsetWithTermVector   = size > 0 && tpVector.GetOffsets(0) != null;

                        FieldInfo fieldInfo = fieldInfos.FieldInfo(tpVector.GetField());
                        OpenField(fieldInfo.number, storePositionWithTermVector, storeOffsetWithTermVector);

                        // Fetch the arrays once instead of once per term.
                        string[] terms = tpVector.GetTerms();
                        int[]    freqs = tpVector.GetTermFrequencies();
                        for (int j = 0; j < size; j++)
                        {
                            AddTermInternal(terms[j], freqs[j], tpVector.GetTermPositions(j), tpVector.GetOffsets(j));
                        }

                        CloseField();
                    }
                    else
                    {
                        // Plain frequency vector: no positions and no offsets are stored.
                        FieldInfo fieldInfo = fieldInfos.FieldInfo(tfVector.GetField());
                        OpenField(fieldInfo.number, false, false);

                        int      size  = tfVector.Size();
                        string[] terms = tfVector.GetTerms();
                        int[]    freqs = tfVector.GetTermFrequencies();
                        for (int j = 0; j < size; j++)
                        {
                            AddTermInternal(terms[j], freqs[j], null, null);
                        }

                        CloseField();
                    }
                }
            }

            CloseDocument();
        }
        /// <summary> Copies the term vectors of every non-deleted document of every reader in
        /// <c>readers</c> into a new TermVectorsWriter for this segment. The writer is closed
        /// even if copying fails.
        /// </summary>
        /// <throws>  IOException </throws>
        private void  MergeVectors()
        {
            TermVectorsWriter vectorsOut = new TermVectorsWriter(directory, segment, fieldInfos);

            try
            {
                for (int readerIdx = 0; readerIdx < readers.Count; readerIdx++)
                {
                    Monodoc.Lucene.Net.Index.IndexReader source = (Monodoc.Lucene.Net.Index.IndexReader)readers[readerIdx];
                    int docLimit = source.MaxDoc();

                    for (int docNum = 0; docNum < docLimit; docNum++)
                    {
                        // Deleted documents are not carried over into the merged segment.
                        if (!source.IsDeleted(docNum))
                        {
                            vectorsOut.OpenDocument();

                            TermFreqVector[] docVectors = source.GetTermFreqVectors(docNum);
                            if (docVectors != null)
                            {
                                foreach (TermFreqVector fieldVector in docVectors)
                                {
                                    // The field is addressed by name, so the writer resolves its own field number.
                                    vectorsOut.OpenField(fieldVector.GetField());

                                    System.String[] fieldTerms = fieldVector.GetTerms();
                                    int[] fieldFreqs = fieldVector.GetTermFrequencies();
                                    for (int termIdx = 0; termIdx < fieldTerms.Length; termIdx++)
                                    {
                                        vectorsOut.AddTerm(fieldTerms[termIdx], fieldFreqs[termIdx]);
                                    }
                                }

                                // NOTE(review): CloseDocument is only reached when the document has
                                // vectors, although OpenDocument ran unconditionally above. Presumably
                                // the writer closes a still-open document itself - confirm against
                                // TermVectorsWriter.
                                vectorsOut.CloseDocument();
                            }
                        }
                    }
                }
            }
            finally
            {
                vectorsOut.Close();
            }
        }
Exemplo n.º 11
0
        /// <summary>
        /// Checks the term vector of DocHelper.TEXT_FIELD_2_KEY for document 0: three terms,
        /// each contained in DocHelper.FIELD_2_TEXT with a positive frequency, and four term
        /// vectors in total for the document.
        /// </summary>
        public virtual void  TestTermVectors()
        {
            TermFreqVector fieldVector = reader.GetTermFreqVector(0, DocHelper.TEXT_FIELD_2_KEY);
            Assert.IsTrue(fieldVector != null);

            System.String[] vectorTerms = fieldVector.GetTerms();
            int[] vectorFreqs = fieldVector.GetTermFrequencies();
            Assert.IsTrue(vectorTerms != null && vectorTerms.Length == 3 && vectorFreqs != null && vectorFreqs.Length == 3);

            for (int pos = 0; pos < vectorTerms.Length; pos++)
            {
                System.String text = vectorTerms[pos];
                int count = vectorFreqs[pos];
                Assert.IsTrue(DocHelper.FIELD_2_TEXT.IndexOf(text) != -1);
                Assert.IsTrue(count > 0);
            }

            // All vectors stored for the document, regardless of field.
            TermFreqVector[] allVectors = reader.GetTermFreqVectors(0);
            Assert.IsTrue(allVectors != null);
            Assert.IsTrue(allVectors.Length == 4, "We do not have 4 term freq vectors, we have: " + allVectors.Length);
        }
Exemplo n.º 12
0
        /// <summary>
        /// Indexes four known documents into a RAMDirectory and verifies that (1) for every
        /// term/document posting the term vector of "Field" reports the same frequency as the
        /// posting, (2) a query for "chocolate" returns documents 3, 4 and 1 in that order, and
        /// (3) the term vector of the second hit matches the expected frequencies in test4Map.
        /// </summary>
        public virtual void  TestKnownSetOfDocuments()
        {
            System.String test1 = "eating chocolate in a computer lab";                                             //6 terms
            System.String test2 = "computer in a computer lab";                                                     //5 terms
            System.String test3 = "a chocolate lab grows old";                                                      //5 terms
            System.String test4 = "eating chocolate with a chocolate lab in an old chocolate colored computer lab"; //13 terms

            // Expected per-term frequencies of test4 (10 distinct terms).
            System.Collections.IDictionary test4Map = new System.Collections.Hashtable();
            test4Map["chocolate"] = 3;
            test4Map["lab"]       = 2;
            test4Map["eating"]    = 1;
            test4Map["computer"]  = 1;
            test4Map["with"]      = 1;
            test4Map["a"]         = 1;
            test4Map["colored"]   = 1;
            test4Map["in"]        = 1;
            test4Map["an"]        = 1;
            test4Map["old"]       = 1;

            Document testDoc1 = new Document();
            SetupDoc(testDoc1, test1);
            Document testDoc2 = new Document();
            SetupDoc(testDoc2, test2);
            Document testDoc3 = new Document();
            SetupDoc(testDoc3, test3);
            Document testDoc4 = new Document();
            SetupDoc(testDoc4, test4);

            Directory dir = new RAMDirectory();

            try
            {
                IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true);
                Assert.IsTrue(writer != null);
                writer.AddDocument(testDoc1);
                writer.AddDocument(testDoc2);
                writer.AddDocument(testDoc3);
                writer.AddDocument(testDoc4);
                writer.Close();

                IndexSearcher knownSearcher = new IndexSearcher(dir);
                TermEnum      termEnum      = knownSearcher.reader.Terms();
                TermDocs      termDocs      = knownSearcher.reader.TermDocs();

                Similarity sim = knownSearcher.GetSimilarity();
                while (termEnum.Next())
                {
                    Term term = termEnum.Term();
                    termDocs.Seek(term);
                    while (termDocs.Next())
                    {
                        int docId = termDocs.Doc();
                        int freq  = termDocs.Freq();

                        TermFreqVector vector = knownSearcher.reader.GetTermFreqVector(docId, "Field");
                        // Assert before the first dereference; previously the null check came
                        // after vector.GetTerms() had already been called.
                        Assert.IsTrue(vector != null);

                        // The scoring values are not asserted; the calls are kept so the
                        // Similarity API is still exercised as before.
                        float tf  = sim.Tf(freq);
                        float idf = sim.Idf(term, knownSearcher);
                        //This is fine since we don't have stop words
                        float lNorm = sim.LengthNorm("Field", vector.GetTerms().Length);

                        System.String[] vTerms = vector.GetTerms();
                        int[]           freqs  = vector.GetTermFrequencies();
                        for (int i = 0; i < vTerms.Length; i++)
                        {
                            if (term.Text().Equals(vTerms[i]))
                            {
                                Assert.IsTrue(freqs[i] == freq);
                            }
                        }
                    }
                }

                Query query = new TermQuery(new Term("Field", "chocolate"));
                Hits  hits  = knownSearcher.Search(query);
                //doc 3 should be the first hit b/c it is the shortest match
                Assert.IsTrue(hits.Length() == 3);
                float score = hits.Score(0);

                Assert.IsTrue(testDoc3.ToString().Equals(hits.Doc(0).ToString()));
                Assert.IsTrue(testDoc4.ToString().Equals(hits.Doc(1).ToString()));
                Assert.IsTrue(testDoc1.ToString().Equals(hits.Doc(2).ToString()));

                TermFreqVector vector2 = knownSearcher.reader.GetTermFreqVector(hits.Id(1), "Field");
                Assert.IsTrue(vector2 != null);

                System.String[] terms  = vector2.GetTerms();
                int[]           freqs2 = vector2.GetTermFrequencies();
                Assert.IsTrue(terms != null && terms.Length == 10);
                for (int i = 0; i < terms.Length; i++)
                {
                    System.String term = terms[i];
                    int freq = freqs2[i];
                    Assert.IsTrue(test4.IndexOf(term) != -1);

                    // Check for a missing entry before unboxing; unboxing null would throw
                    // NullReferenceException instead of failing the assertion.
                    System.Object tmpFreqInt = test4Map[term];
                    Assert.IsTrue(tmpFreqInt != null);
                    System.Int32 freqInt = (System.Int32)tmpFreqInt;
                    Assert.IsTrue(freqInt == freq);
                }
                knownSearcher.Close();
            }
            catch (System.IO.IOException e)
            {
                // An I/O failure is reported on stderr and turned into a test failure.
                System.Console.Error.WriteLine(e.StackTrace);
                Assert.IsTrue(false);
            }
        }
Exemplo n.º 13
0
        /// <summary>
        /// Indexes four known documents into a MockRAMDirectory and verifies that (1) for every
        /// term/document posting the term vector of "field" reports the same frequency as the
        /// posting, (2) a query for "chocolate" returns doc ids 2, 3 and 0 in that order, (3) the
        /// term vector of the second hit matches test4Map, and (4) SortedTermVectorMapper and
        /// FieldSortedTermVectorMapper return the expected number of entries in frequency order.
        /// </summary>
        public virtual void  TestKnownSetOfDocuments()
        {
            System.String test1 = "eating chocolate in a computer lab";                                             //6 terms
            System.String test2 = "computer in a computer lab";                                                     //5 terms
            System.String test3 = "a chocolate lab grows old";                                                      //5 terms
            System.String test4 = "eating chocolate with a chocolate lab in an old chocolate colored computer lab"; //13 terms

            // Expected per-term frequencies of test4 (10 distinct terms).
            System.Collections.IDictionary test4Map = new System.Collections.Hashtable();
            test4Map["chocolate"] = 3;
            test4Map["lab"]       = 2;
            test4Map["eating"]    = 1;
            test4Map["computer"]  = 1;
            test4Map["with"]      = 1;
            test4Map["a"]         = 1;
            test4Map["colored"]   = 1;
            test4Map["in"]        = 1;
            test4Map["an"]        = 1;
            test4Map["old"]       = 1;

            Document testDoc1 = new Document();
            SetupDoc(testDoc1, test1);
            Document testDoc2 = new Document();
            SetupDoc(testDoc2, test2);
            Document testDoc3 = new Document();
            SetupDoc(testDoc3, test3);
            Document testDoc4 = new Document();
            SetupDoc(testDoc4, test4);

            Directory dir = new MockRAMDirectory();

            try
            {
                IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
                Assert.IsTrue(writer != null);
                writer.AddDocument(testDoc1);
                writer.AddDocument(testDoc2);
                writer.AddDocument(testDoc3);
                writer.AddDocument(testDoc4);
                writer.Close();

                IndexSearcher knownSearcher = new IndexSearcher(dir);
                TermEnum      termEnum      = knownSearcher.reader_ForNUnit.Terms();
                TermDocs      termDocs      = knownSearcher.reader_ForNUnit.TermDocs();

                Similarity sim = knownSearcher.GetSimilarity();
                while (termEnum.Next())
                {
                    Term term = termEnum.Term();
                    termDocs.Seek(term);
                    while (termDocs.Next())
                    {
                        int docId = termDocs.Doc();
                        int freq  = termDocs.Freq();

                        TermFreqVector vector = knownSearcher.reader_ForNUnit.GetTermFreqVector(docId, "field");
                        // Assert before the first dereference; previously the null check came
                        // after vector.GetTerms() had already been called.
                        Assert.IsTrue(vector != null);

                        // The scoring values are not asserted; the calls are kept so the
                        // Similarity API is still exercised as before.
                        float tf  = sim.Tf(freq);
                        float idf = sim.Idf(term, knownSearcher);
                        //This is fine since we don't have stop words
                        float lNorm = sim.LengthNorm("field", vector.GetTerms().Length);

                        System.String[] vTerms = vector.GetTerms();
                        int[]           freqs  = vector.GetTermFrequencies();
                        for (int i = 0; i < vTerms.Length; i++)
                        {
                            if (term.Text().Equals(vTerms[i]))
                            {
                                Assert.IsTrue(freqs[i] == freq);
                            }
                        }
                    }
                }

                Query      query = new TermQuery(new Term("field", "chocolate"));
                ScoreDoc[] hits  = knownSearcher.Search(query, null, 1000).scoreDocs;
                //doc 3 should be the first hit b/c it is the shortest match
                Assert.IsTrue(hits.Length == 3);
                float score = hits[0].score;

                Assert.IsTrue(hits[0].doc == 2);
                Assert.IsTrue(hits[1].doc == 3);
                Assert.IsTrue(hits[2].doc == 0);

                TermFreqVector vector2 = knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].doc, "field");
                Assert.IsTrue(vector2 != null);

                System.String[] terms  = vector2.GetTerms();
                int[]           freqs2 = vector2.GetTermFrequencies();
                Assert.IsTrue(terms != null && terms.Length == 10);
                for (int i = 0; i < terms.Length; i++)
                {
                    System.String term = terms[i];
                    int freq = freqs2[i];
                    Assert.IsTrue(test4.IndexOf(term) != -1);

                    // Check that the expected entry exists and is an int before unboxing,
                    // instead of catching whatever exception the unboxing might throw.
                    System.Object boxedFreq = test4Map[term];
                    Assert.IsTrue(boxedFreq is System.Int32);
                    System.Int32 freqInt = (System.Int32)boxedFreq;
                    Assert.IsTrue(freqInt == freq);
                }

                // Collapse all fields of the second hit into one frequency-sorted entry set.
                SortedTermVectorMapper mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
                knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].doc, mapper);
                System.Collections.Generic.SortedDictionary <object, object> vectorEntrySet = mapper.GetTermVectorEntrySet();
                Assert.IsTrue(vectorEntrySet.Count == 10, "mapper.getTermVectorEntrySet() Size: " + vectorEntrySet.Count + " is not: " + 10);
                TermVectorEntry last = null;
                foreach (TermVectorEntry tve in vectorEntrySet.Keys)
                {
                    if (tve != null && last != null)
                    {
                        Assert.IsTrue(last.GetFrequency() >= tve.GetFrequency(), "terms are not properly sorted");
                        System.Int32 expectedFreq = (System.Int32)test4Map[tve.GetTerm()];
                        //we expect double the expectedFreq, since there are two fields with the exact same text and we are collapsing all fields
                        Assert.IsTrue(tve.GetFrequency() == 2 * expectedFreq, "Frequency is not correct:");
                    }
                    last = tve;
                }

                // Same document again, this time keeping the entries separated per field.
                FieldSortedTermVectorMapper fieldMapper = new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
                knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].doc, fieldMapper);
                System.Collections.IDictionary map = fieldMapper.GetFieldToTerms();
                Assert.IsTrue(map.Count == 2, "map Size: " + map.Count + " is not: " + 2);
                vectorEntrySet = (System.Collections.Generic.SortedDictionary <Object, Object>)map["field"];
                Assert.IsTrue(vectorEntrySet != null, "vectorEntrySet is null and it shouldn't be");
                Assert.IsTrue(vectorEntrySet.Count == 10, "vectorEntrySet Size: " + vectorEntrySet.Count + " is not: " + 10);
                knownSearcher.Close();
            }
            catch (System.IO.IOException e)
            {
                // An I/O failure is reported on stderr and turned into a test failure.
                System.Console.Error.WriteLine(e.StackTrace);
                Assert.IsTrue(false);
            }
        }
Exemplo n.º 14
0
        /// <summary>
        /// Asserts that two arrays of term vectors are equivalent: both null, or same length with
        /// pairwise equal sizes, terms and frequencies. When the left vector is a
        /// TermPositionVector the right one must be too, and their positions and (when present)
        /// offsets must match as well.
        /// </summary>
        /// <param name="d1">Expected vectors; may be null.</param>
        /// <param name="d2">Actual vectors; must be null exactly when <paramref name="d1"/> is.</param>
        public static void  VerifyEquals(TermFreqVector[] d1, TermFreqVector[] d2)
        {
            if (d1 == null)
            {
                Assert.IsTrue(d2 == null);
                return;
            }
            Assert.IsTrue(d2 != null);
            Assert.AreEqual(d1.Length, d2.Length);

            for (int vecIdx = 0; vecIdx < d1.Length; vecIdx++)
            {
                TermFreqVector left  = d1[vecIdx];
                TermFreqVector right = d2[vecIdx];
                if (left == null || right == null)
                {
                    // Diagnostic only: the dereference below still fails for a null vector.
                    System.Console.Out.WriteLine("v1=" + left + " v2=" + right + " i=" + vecIdx + " of " + d1.Length);
                }

                Assert.AreEqual(left.Size(), right.Size());
                int termCount = left.Size();
                System.String[] leftTerms  = left.GetTerms();
                System.String[] rightTerms = right.GetTerms();
                int[] leftFreqs  = left.GetTermFrequencies();
                int[] rightFreqs = right.GetTermFrequencies();

                for (int t = 0; t < termCount; t++)
                {
                    if (!leftTerms[t].Equals(rightTerms[t]))
                    {
                        Assert.AreEqual(leftTerms[t], rightTerms[t]);
                    }
                    Assert.AreEqual(leftFreqs[t], rightFreqs[t]);
                }

                // Positions and offsets only exist on position vectors; nothing more to compare otherwise.
                if (!(left is TermPositionVector))
                {
                    continue;
                }

                Assert.IsTrue(right is TermPositionVector);
                TermPositionVector leftPositions  = (TermPositionVector)left;
                TermPositionVector rightPositions = (TermPositionVector)right;

                for (int t = 0; t < termCount; t++)
                {
                    int[] posA = leftPositions.GetTermPositions(t);
                    int[] posB = rightPositions.GetTermPositions(t);
                    Assert.AreEqual(posA.Length, posB.Length);

                    TermVectorOffsetInfo[] offA = leftPositions.GetOffsets(t);
                    TermVectorOffsetInfo[] offB = rightPositions.GetOffsets(t);
                    // Offsets must be present on both sides or absent on both sides.
                    Assert.IsTrue((offA == null) == (offB == null));

                    for (int p = 0; p < posA.Length; p++)
                    {
                        Assert.AreEqual(posA[p], posB[p]);
                        if (offA != null)
                        {
                            Assert.AreEqual(offA[p].GetStartOffset(), offB[p].GetStartOffset());
                            Assert.AreEqual(offA[p].GetEndOffset(), offB[p].GetEndOffset());
                        }
                    }
                }
            }
        }
Exemplo n.º 15
0
		/// <summary>
		/// Writes one term vector: opens the vector's field, adds every (term, frequency) pair
		/// through AddTermInternal, then closes the field.
		/// </summary>
		/// <param name="vector">Vector whose field, terms and frequencies are written.</param>
		private void  AddTermFreqVectorInternal(TermFreqVector vector)
		{
			OpenField(vector.GetField());

			// Read the arrays and the size once instead of re-invoking the getters on every iteration.
			int size = vector.Size();
			string[] terms = vector.GetTerms();
			int[] freqs = vector.GetTermFrequencies();
			for (int i = 0; i < size; i++)
			{
				AddTermInternal(terms[i], freqs[i]);
			}

			CloseField();
		}