예제 #1
0
        /// <summary>
        /// Adds the terms and frequencies found in <paramref name="vector"/> to
        /// <paramref name="termFreqMap"/>. Terms for which IsNoiseWord returns true are skipped.
        /// </summary>
        /// <param name="termFreqMap">map from term to its frequency counter; updated in place
        /// </param>
        /// <param name="vector">terms and their frequencies for a doc/field
        /// </param>
        protected void AddTermFrequencies(IDictionary <string, Int> termFreqMap, ITermFreqVector vector)
        {
            System.String[] terms = vector.GetTerms();
            int[]           freqs = vector.GetTermFrequencies();
            for (int j = 0; j < terms.Length; j++)
            {
                System.String term = terms[j];

                if (IsNoiseWord(term))
                {
                    continue;
                }

                // Look the counter up with TryGetValue rather than the indexer: the indexer of a
                // standard IDictionary implementation throws KeyNotFoundException for an unseen
                // term, so the "first occurrence" branch would never be reached.
                Int cnt;
                if (!termFreqMap.TryGetValue(term, out cnt) || cnt == null)
                {
                    // first occurrence of this term: start a new counter
                    cnt = new Int();
                    termFreqMap[term] = cnt;
                    cnt.x             = freqs[j];
                }
                else
                {
                    // term already seen: accumulate frequency
                    cnt.x += freqs[j];
                }
            }
        }
        /// <summary>
        /// For every document, fetches its term vectors twice (all fields at once, then the
        /// single field "field"), adds the wall-clock time of each fetch to timeElapsed and
        /// passes each result to VerifyVectors.
        /// </summary>
        private void  TestTermVectors()
        {
            int docCount = reader.NumDocs();

            for (int doc = 0; doc < docCount; doc++)
            {
                // time the "all fields" lookup (milliseconds derived from DateTime ticks)
                long began = DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond;
                ITermFreqVector[] allFields = reader.GetTermFreqVectors(doc, null);
                timeElapsed += (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond) - began;

                VerifyVectors(allFields, doc);

                // time the single-field lookup
                began = DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond;
                ITermFreqVector single = reader.GetTermFreqVector(doc, "field", null);
                timeElapsed += (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond) - began;

                // wrap the single vector so the same verifier can be reused
                ITermFreqVector[] wrapped = new ITermFreqVector[] { single };
                VerifyVectors(wrapped, doc);
            }
        }
예제 #3
0
        /// <summary>
        /// Collects term frequencies for the given document across all configured field names
        /// and builds a queue from them via CreateQueue.
        /// </summary>
        /// <param name="docNum">the id of the lucene document from which to find terms
        /// </param>
        /// <returns>the result of CreateQueue applied to the collected term/frequency map</returns>
        protected virtual PriorityQueue <object[]> RetrieveTerms(int docNum)
        {
            IDictionary <string, Int> frequencies = new HashMap <string, Int>();

            for (int fieldIdx = 0; fieldIdx < fieldNames.Length; fieldIdx++)
            {
                System.String   field  = fieldNames[fieldIdx];
                ITermFreqVector stored = ir.GetTermFreqVector(docNum, field);

                if (stored != null)
                {
                    AddTermFrequencies(frequencies, stored);
                    continue;
                }

                // field does not store term vector info: fall back to the stored field values
                Document        doc    = ir.Document(docNum);
                System.String[] values = doc.GetValues(field);
                if (values == null)
                {
                    continue;
                }

                foreach (System.String value in values)
                {
                    AddTermFrequencies(new System.IO.StringReader(value), frequencies, field);
                }
            }

            return CreateQueue(frequencies);
        }
예제 #4
0
        /// <summary>
        /// Builds the form: shows the field name in lblField and fills listViewTVF with one row
        /// (frequency, term text) per term of the vector, after sorting the rows with
        /// IntPair.PairComparator(false, true).
        /// </summary>
        /// <param name="fieldName">field name displayed in the label</param>
        /// <param name="tfv">term vector whose terms and frequencies are listed</param>
        public TermVector(string fieldName, ITermFreqVector tfv)
        {
            // Required for Windows Form Designer support
            InitializeComponent();

            lblField.Text = fieldName;

            IntPair[] pairs      = new IntPair[tfv.Size];
            String[]  termTexts  = tfv.GetTerms();
            int[]     termCounts = tfv.GetTermFrequencies();

            int slot = 0;
            foreach (String text in termTexts)
            {
                pairs[slot] = new IntPair(termCounts[slot], text);
                slot++;
            }
            Array.Sort(pairs, new IntPair.PairComparator(false, true));

            // Items are added between BeginUpdate and EndUpdate
            listViewTVF.BeginUpdate();
            foreach (IntPair pair in pairs)
            {
                string[] columns = new string[] { pair.cnt.ToString(), pair.text };
                listViewTVF.Items.Add(new ListViewItem(columns));
            }
            listViewTVF.EndUpdate();
        }
        /// <summary>
        /// Indexes one document in which fields "f1" and "f2" are each added twice, once with
        /// and once without term vectors (in opposite orders), then asserts that both fields
        /// expose a term vector containing two terms.
        /// </summary>
        public virtual void  TestMixedTermVectorSettingsSameField()
        {
            Document mixedDoc = new Document();

            // f1: first without term vector, then with
            Field f1Plain = new Field("f1", "v1", Field.Store.YES, Field.Index.NOT_ANALYZED, TermVector.NO);
            Field f1Tv    = new Field("f1", "v2", Field.Store.YES, Field.Index.NOT_ANALYZED, TermVector.WITH_POSITIONS_OFFSETS);
            mixedDoc.Add(f1Plain);
            mixedDoc.Add(f1Tv);

            // f2: first with term vector, then without
            Field f2Tv    = new Field("f2", "v1", Field.Store.YES, Field.Index.NOT_ANALYZED, TermVector.WITH_POSITIONS_OFFSETS);
            Field f2Plain = new Field("f2", "v2", Field.Store.YES, Field.Index.NOT_ANALYZED, TermVector.NO);
            mixedDoc.Add(f2Tv);
            mixedDoc.Add(f2Plain);

            StandardAnalyzer analyzer    = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_CURRENT);
            IndexWriter      indexWriter = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
            indexWriter.AddDocument(mixedDoc);
            indexWriter.Close();

            _TestUtil.CheckIndex(dir);

            IndexReader indexReader = IndexReader.Open(dir, true);

            ITermFreqVector f1Vector = indexReader.GetTermFreqVector(0, "f1");
            Assert.IsNotNull(f1Vector);
            Assert.AreEqual(2, f1Vector.GetTerms().Length, "the 'with_tv' setting should rule!");

            ITermFreqVector f2Vector = indexReader.GetTermFreqVector(0, "f2");
            Assert.IsNotNull(f2Vector);
            Assert.AreEqual(2, f2Vector.GetTerms().Length, "the 'with_tv' setting should rule!");
        }
예제 #6
0
        /// <summary>
        /// Merges reader1 and reader2 into a new segment, reopens it, and checks the merged
        /// document count, the stored field counts, a term lookup, the set of fields indexed
        /// with term vectors, the term vector of TEXT_FIELD_2_KEY for document 0, and norms.
        /// </summary>
        public virtual void  TestMerge()
        {
            SegmentMerger segmentMerger = new SegmentMerger(mergedDir, mergedSegment);
            segmentMerger.Add(reader1);
            segmentMerger.Add(reader2);
            int mergedCount = segmentMerger.Merge(null);
            segmentMerger.CloseReaders();
            Assert.IsTrue(mergedCount == 2);

            // Should be able to open a new SegmentReader against the new directory
            SegmentInfo   mergedInfo   = new SegmentInfo(mergedSegment, mergedCount, mergedDir, false, true);
            SegmentReader mergedReader = SegmentReader.Get(true, mergedInfo, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR, null);
            Assert.IsTrue(mergedReader != null);
            Assert.IsTrue(mergedReader.NumDocs() == 2);

            // The merged documents lack the unstored fields of the originals
            Document firstMerged = mergedReader.Document(0, null);
            Assert.IsTrue(firstMerged != null);
            Assert.IsTrue(DocHelper.NumFields(firstMerged) == DocHelper.NumFields(doc1) - DocHelper.unstored.Count);

            Document secondMerged = mergedReader.Document(1, null);
            Assert.IsTrue(secondMerged != null);
            Assert.IsTrue(DocHelper.NumFields(secondMerged) == DocHelper.NumFields(doc2) - DocHelper.unstored.Count);

            Term     probe    = new Term(DocHelper.TEXT_FIELD_2_KEY, "field");
            TermDocs postings = mergedReader.TermDocs(probe, null);
            Assert.IsTrue(postings != null);
            Assert.IsTrue(postings.Next(null));

            System.Collections.Generic.ICollection <string> tvFields = mergedReader.GetFieldNames(IndexReader.FieldOption.INDEXED_WITH_TERMVECTOR);
            Assert.IsTrue(tvFields != null);
            Assert.IsTrue(tvFields.Count == 3, "We do not have 3 fields that were indexed with term vector");

            ITermFreqVector field2Vector = mergedReader.GetTermFreqVector(0, DocHelper.TEXT_FIELD_2_KEY, null);
            Assert.IsTrue(field2Vector != null);

            System.String[] vectorTerms = field2Vector.GetTerms();
            Assert.IsTrue(vectorTerms != null);
            Assert.IsTrue(vectorTerms.Length == 3);

            int[] vectorFreqs = field2Vector.GetTermFrequencies();
            Assert.IsTrue(vectorFreqs != null);
            Assert.IsTrue(field2Vector is TermPositionVector);

            // each term must occur in the source text with the expected frequency
            for (int t = 0; t < vectorTerms.Length; t++)
            {
                Assert.IsTrue(DocHelper.FIELD_2_TEXT.IndexOf(vectorTerms[t]) != -1);
                Assert.IsTrue(DocHelper.FIELD_2_FREQS[t] == vectorFreqs[t]);
            }

            TestSegmentReader.CheckNorms(mergedReader);
        }
        // Searches the index and reports, for each hit, the positions and character offsets
        // at which the query term was found in the "LineText" field.
        //
        // db             - not used by this method body.
        // querystr       - query text; parsed against the "LineText" field and also looked up
        //                  verbatim in each hit's term vector.
        // indexDirectory - Lucene directory holding the index to search.
        public void DoSearch(String db, String querystr, global::Lucene.Net.Store.Directory indexDirectory)
        {
            // 1. Specify the analyzer for tokenizing text.
            //    The same analyzer should be used as was used for indexing
            StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_30, ListStopWords);

            // 2. query
            Query q = new QueryParser(Version.LUCENE_30, "LineText", analyzer).Parse(querystr);

            // 3. search
            int           hitsPerPage = 10;
            IndexSearcher searcher    = new IndexSearcher(indexDirectory, true);
            try
            {
                IndexReader reader = IndexReader.Open(indexDirectory, true);
                try
                {
                    searcher.SetDefaultFieldSortScoring(true, false);
                    TopScoreDocCollector collector = TopScoreDocCollector.Create(hitsPerPage, true);

                    searcher.Search(q, collector);
                    ScoreDoc[] hits = collector.TopDocs().ScoreDocs;

                    // 4. display term positions, and term indexes
                    MessageBox.Show("Found " + hits.Length + " hits.");

                    for (int i = 0; i < hits.Length; ++i)
                    {
                        int docId = hits[i].Doc;
                        ShowTermLocations(reader.GetTermFreqVector(docId, "LineText"), querystr);

                        // print some info about where the hit was found...
                        Document d = searcher.Doc(docId);
                        MessageBox.Show((i + 1) + ". " + d.Get("path"));
                    }
                }
                finally
                {
                    // the separately opened reader must be released too
                    reader.Dispose();
                }
            }
            finally
            {
                // searcher can only be closed when there
                // is no need to access the documents any more.
                searcher.Dispose();
            }
        }

        // Shows the positions and offsets of `term` inside one document's term vector.
        // Does nothing when the vector is missing, carries no position data, or does not
        // contain the term (IndexOf returned a negative index).
        private static void ShowTermLocations(ITermFreqVector tfvector, String term)
        {
            TermPositionVector tpvector = tfvector as TermPositionVector;
            if (tpvector == null)
            {
                return;
            }

            // this part works only if there is one term in the query string,
            // otherwise you will have to iterate this section over the query terms.
            int termidx = tfvector.IndexOf(term);
            if (termidx < 0)
            {
                return;
            }

            int[] termposx = tpvector.GetTermPositions(termidx);
            if (termposx != null)
            {
                for (int j = 0; j < termposx.Length; j++)
                {
                    MessageBox.Show("termpos : " + termposx[j]);
                }
            }

            TermVectorOffsetInfo[] tvoffsetinfo = tpvector.GetOffsets(termidx);
            if (tvoffsetinfo != null)
            {
                for (int j = 0; j < tvoffsetinfo.Length; j++)
                {
                    int offsetStart = tvoffsetinfo[j].StartOffset;
                    int offsetEnd   = tvoffsetinfo[j].EndOffset;
                    MessageBox.Show("offsets : " + offsetStart + " " + offsetEnd);
                }
            }
        }
예제 #8
0
        /// <summary>
        /// Reads document 0 through a TermVectorsReader. For testFields[0] it checks terms,
        /// positions and offsets against the fixture arrays; for testFields[1] it checks that
        /// the vector is not a TermPositionVector and that its terms match the fixture.
        /// </summary>
        public virtual void  TestPositionReader()
        {
            TermVectorsReader tvReader = new TermVectorsReader(dir, seg, fieldInfos);
            Assert.IsTrue(tvReader != null);

            TermPositionVector posVector = (TermPositionVector)tvReader.Get(0, testFields[0]);
            Assert.IsTrue(posVector != null);

            System.String[] readTerms = posVector.GetTerms();
            Assert.IsTrue(readTerms != null);
            Assert.IsTrue(readTerms.Length == testTerms.Length);

            for (int t = 0; t < readTerms.Length; t++)
            {
                Assert.IsTrue(readTerms[t].Equals(testTerms[t]));

                // positions must match the fixture element by element
                int[] readPositions = posVector.GetTermPositions(t);
                Assert.IsTrue(readPositions != null);
                Assert.IsTrue(readPositions.Length == this.positions[t].Length);
                for (int p = 0; p < readPositions.Length; p++)
                {
                    Assert.IsTrue(readPositions[p] == this.positions[t][p]);
                }

                // offsets must match the fixture element by element
                TermVectorOffsetInfo[] readOffsets = posVector.GetOffsets(t);
                Assert.IsTrue(readOffsets != null);
                Assert.IsTrue(readOffsets.Length == this.offsets[t].Length);
                for (int o = 0; o < readOffsets.Length; o++)
                {
                    Assert.IsTrue(readOffsets[o].Equals(this.offsets[t][o]));
                }
            }

            // no pos, no offset
            ITermFreqVector plainVector = tvReader.Get(0, testFields[1]);
            Assert.IsTrue(plainVector != null);
            Assert.IsTrue(!(plainVector is TermPositionVector));

            readTerms = plainVector.GetTerms();
            Assert.IsTrue(readTerms != null);
            Assert.IsTrue(readTerms.Length == testTerms.Length);
            for (int t = 0; t < readTerms.Length; t++)
            {
                Assert.IsTrue(readTerms[t].Equals(testTerms[t]));
            }
        }
예제 #9
0
        /// <summary>
        /// Computes the relative term frequency (term count divided by the sum of all term
        /// counts) of every term in the "text" field of the document registered under
        /// <paramref name="filename"/> in docsInfo.
        /// </summary>
        /// <param name="filename">key into docsInfo identifying the document</param>
        /// <param name="reader">index reader used to fetch the term vector</param>
        /// <returns>
        /// map from term to relative frequency; empty when the reader returns no term vector
        /// for the field
        /// </returns>
        private Dictionary <string, float> getTfForDoc(string filename, IndexReader reader)
        {
            ITermFreqVector termFreqVector = reader.GetTermFreqVector(docsInfo[filename], "text");

            // The reader may return no vector for the field; report "no terms" instead of
            // failing with a NullReferenceException.
            if (termFreqVector == null)
            {
                return new Dictionary <string, float>();
            }

            string[] terms       = termFreqVector.GetTerms();
            int[]    termFreqs   = termFreqVector.GetTermFrequencies();
            int      total_words = termFreqs.Sum();

            // pair each term with its count; the cast forces floating-point division
            return terms
                   .Zip(termFreqs, (term, freq) => new KeyValuePair <string, float>(term, (float)freq / total_words))
                   .ToDictionary(pair => pair.Key, pair => pair.Value);
        }
예제 #10
0
        // Collects the term vector of document n from every (field, reader) pair in
        // fieldToReader, leaving out fields for which the reader returns null.
        public override ITermFreqVector[] GetTermFreqVectors(int n, IState state)
        {
            EnsureOpen();

            IList <ITermFreqVector> found = new List <ITermFreqVector>();
            foreach (var entry in fieldToReader)
            {
                System.String fieldName   = entry.Key;
                IndexReader   fieldReader = entry.Value;

                ITermFreqVector fieldVector = fieldReader.GetTermFreqVector(n, fieldName, state);
                if (fieldVector == null)
                {
                    continue;
                }
                found.Add(fieldVector);
            }

            return found.ToArray();
        }
예제 #11
0
        /// <summary>
        /// Reads the segment infos, opens a reader and checks that documents 0 and 1 come back
        /// with the expected number of stored fields, that document 0 has a term vector for
        /// TEXT_FIELD_2_KEY, and that norms pass TestSegmentReader.CheckNorms.
        /// </summary>
        public virtual void  DoTestDocument()
        {
            sis.Read(dir);

            IndexReader opened = OpenReader();
            Assert.IsTrue(opened != null);

            // number of fields that were indexed but not stored
            Document first = opened.Document(0);
            Assert.IsTrue(first != null);
            Assert.IsTrue(DocHelper.NumFields(first) == DocHelper.NumFields(doc1) - DocHelper.unstored.Count);

            Document second = opened.Document(1);
            Assert.IsTrue(second != null);
            Assert.IsTrue(DocHelper.NumFields(second) == DocHelper.NumFields(doc2) - DocHelper.unstored.Count);

            ITermFreqVector field2Vector = opened.GetTermFreqVector(0, DocHelper.TEXT_FIELD_2_KEY);
            Assert.IsTrue(field2Vector != null);

            TestSegmentReader.CheckNorms(opened);
        }
예제 #12
0
        /// <summary>
        /// Reads the term vector of testFields[0] for documents 0 through 4 and asserts that
        /// each vector's terms equal testTerms, in order.
        /// </summary>
        public virtual void  TestReader()
        {
            TermVectorsReader tvReader = new TermVectorsReader(dir, seg, fieldInfos);
            Assert.IsTrue(tvReader != null);

            for (int doc = 0; doc < 5; doc++)
            {
                ITermFreqVector docVector = tvReader.Get(doc, testFields[0]);
                Assert.IsTrue(docVector != null);

                System.String[] readTerms = docVector.GetTerms();
                Assert.IsTrue(readTerms != null);
                Assert.IsTrue(readTerms.Length == testTerms.Length);

                int idx = 0;
                foreach (System.String readTerm in readTerms)
                {
                    Assert.IsTrue(readTerm.Equals(testTerms[idx]));
                    idx++;
                }
            }
        }
예제 #13
0
        /// <summary>
        /// Checks that document 0 has a three-term vector for TEXT_FIELD_2_KEY whose terms all
        /// occur in FIELD_2_TEXT with positive frequency, and that the document exposes three
        /// term vectors overall.
        /// </summary>
        public virtual void  TestTermVectors()
        {
            ITermFreqVector field2Vector = reader.GetTermFreqVector(0, DocHelper.TEXT_FIELD_2_KEY, null);
            Assert.IsTrue(field2Vector != null);

            System.String[] vectorTerms = field2Vector.GetTerms();
            int[]           vectorFreqs = field2Vector.GetTermFrequencies();

            bool termsOk = vectorTerms != null && vectorTerms.Length == 3;
            bool freqsOk = vectorFreqs != null && vectorFreqs.Length == 3;
            Assert.IsTrue(termsOk && freqsOk);

            for (int t = 0; t < vectorTerms.Length; t++)
            {
                Assert.IsTrue(DocHelper.FIELD_2_TEXT.IndexOf(vectorTerms[t]) != -1);
                Assert.IsTrue(vectorFreqs[t] > 0);
            }

            ITermFreqVector[] allVectors = reader.GetTermFreqVectors(0, null);
            Assert.IsTrue(allVectors != null);
            Assert.IsTrue(allVectors.Length == 3, "We do not have 3 term freq vectors, we have: " + allVectors.Length);
        }
예제 #14
0
        /// <summary>
        /// Checks TermVectorsReader.Get with invalid arguments: document number 50 must raise
        /// an IOException (with and without a field name), while an unknown field name on
        /// document 0 must return null without throwing.
        /// </summary>
        public virtual void  TestBadParams()
        {
            // Bad document number, good field number
            TermVectorsReader badDocWithField = new TermVectorsReader(dir, seg, fieldInfos);
            Assert.IsTrue(badDocWithField != null);
            Assert.Throws <System.IO.IOException>(() => badDocWithField.Get(50, testFields[0]));

            // Bad document number, no field
            TermVectorsReader badDocNoField = new TermVectorsReader(dir, seg, fieldInfos);
            Assert.IsTrue(badDocNoField != null);
            Assert.Throws <System.IO.IOException>(() => badDocNoField.Get(50));

            // good document number, bad field number
            TermVectorsReader badField = new TermVectorsReader(dir, seg, fieldInfos);
            Assert.IsTrue(badField != null);
            Assert.DoesNotThrow(() =>
            {
                ITermFreqVector missing = badField.Get(0, "f50");
                Assert.IsTrue(missing == null);
            });
        }
		/// <summary>
		/// Concatenates every term of every vector and compares the trimmed result with the
		/// trimmed output of English.IntToEnglish(num); a mismatch is reported on the console.
		/// </summary>
		/// <param name="vectors">term vectors whose terms are concatenated in order</param>
		/// <param name="num">number passed to English.IntToEnglish to produce the expected text</param>
		private void  VerifyVectors(ITermFreqVector[] vectors, int num)
		{
			System.Text.StringBuilder joined = new System.Text.StringBuilder();
			foreach (ITermFreqVector current in vectors)
			{
				foreach (System.String term in current.GetTerms())
				{
					joined.Append(term);
				}
			}

			System.String expected = English.IntToEnglish(num).Trim();
			System.String actual = joined.ToString().Trim();
			if (!expected.Equals(actual))
			{
				System.Console.Out.WriteLine("wrong term result");
			}
		}
예제 #16
0
        /// <summary>
        /// Asserts that two arrays of term vectors are equivalent: both null, or the same
        /// length with pairwise equal size, terms and frequencies. When a vector in
        /// <paramref name="d1"/> is a TermPositionVector, its counterpart must be one too, with
        /// equal positions and (when present) equal start/end offsets.
        /// </summary>
        /// <param name="d1">first array of term vectors; may be null</param>
        /// <param name="d2">second array of term vectors; must be null exactly when d1 is</param>
        public static void  VerifyEquals(ITermFreqVector[] d1, ITermFreqVector[] d2)
        {
            if (d1 == null)
            {
                Assert.IsTrue(d2 == null);
                return;
            }
            Assert.IsTrue(d2 != null);

            Assert.AreEqual(d1.Length, d2.Length);
            for (int i = 0; i < d1.Length; i++)
            {
                ITermFreqVector v1 = d1[i];
                ITermFreqVector v2 = d2[i];

                // Fail with the diagnostic text instead of printing it to the console and then
                // crashing with a NullReferenceException on the next line.
                Assert.IsTrue(v1 != null && v2 != null, "v1=" + v1 + " v2=" + v2 + " i=" + i + " of " + d1.Length);

                Assert.AreEqual(v1.Size, v2.Size);
                int             numTerms = v1.Size;
                System.String[] terms1   = v1.GetTerms();
                System.String[] terms2   = v2.GetTerms();
                int[]           freq1    = v1.GetTermFrequencies();
                int[]           freq2    = v2.GetTermFrequencies();
                for (int j = 0; j < numTerms; j++)
                {
                    Assert.AreEqual(terms1[j], terms2[j]);
                    Assert.AreEqual(freq1[j], freq2[j]);
                }

                if (!(v1 is TermPositionVector))
                {
                    continue;
                }

                Assert.IsTrue(v2 is TermPositionVector);
                TermPositionVector tpv1 = (TermPositionVector)v1;
                TermPositionVector tpv2 = (TermPositionVector)v2;
                for (int j = 0; j < numTerms; j++)
                {
                    int[] pos1 = tpv1.GetTermPositions(j);
                    int[] pos2 = tpv2.GetTermPositions(j);
                    Assert.AreEqual(pos1.Length, pos2.Length);

                    // offsets are optional, but must be present on both sides or on neither
                    TermVectorOffsetInfo[] offsets1 = tpv1.GetOffsets(j);
                    TermVectorOffsetInfo[] offsets2 = tpv2.GetOffsets(j);
                    if (offsets1 == null)
                    {
                        Assert.IsTrue(offsets2 == null);
                    }
                    else
                    {
                        Assert.IsTrue(offsets2 != null);
                    }

                    for (int k = 0; k < pos1.Length; k++)
                    {
                        Assert.AreEqual(pos1[k], pos2[k]);
                        if (offsets1 != null)
                        {
                            Assert.AreEqual(offsets1[k].StartOffset, offsets2[k].StartOffset);
                            Assert.AreEqual(offsets1[k].EndOffset, offsets2[k].EndOffset);
                        }
                    }
                }
            }
        }
예제 #17
0
		/// <summary>
		/// Asserts that two arrays of term vectors are equivalent: both null, or the same
		/// length with pairwise equal size, terms and frequencies. When a vector in
		/// <paramref name="d1"/> is a TermPositionVector, its counterpart must be one too, with
		/// equal positions and (when present) equal start/end offsets.
		/// </summary>
		/// <param name="d1">first array of term vectors; may be null</param>
		/// <param name="d2">second array of term vectors; must be null exactly when d1 is</param>
		public static void  VerifyEquals(ITermFreqVector[] d1, ITermFreqVector[] d2)
		{
			if (d1 == null)
			{
				Assert.IsTrue(d2 == null);
				return ;
			}
			Assert.IsTrue(d2 != null);

			Assert.AreEqual(d1.Length, d2.Length);
			for (int i = 0; i < d1.Length; i++)
			{
				ITermFreqVector v1 = d1[i];
				ITermFreqVector v2 = d2[i];

				// A null entry used to be printed to the console and then dereferenced, which
				// ended the test with a NullReferenceException; fail with the same text instead.
				Assert.IsTrue(v1 != null && v2 != null, "v1=" + v1 + " v2=" + v2 + " i=" + i + " of " + d1.Length);

				Assert.AreEqual(v1.Size, v2.Size);
				int numTerms = v1.Size;
				System.String[] terms1 = v1.GetTerms();
				System.String[] terms2 = v2.GetTerms();
				int[] freq1 = v1.GetTermFrequencies();
				int[] freq2 = v2.GetTermFrequencies();
				for (int j = 0; j < numTerms; j++)
				{
					Assert.AreEqual(terms1[j], terms2[j]);
					Assert.AreEqual(freq1[j], freq2[j]);
				}

				if (v1 is TermPositionVector)
				{
					Assert.IsTrue(v2 is TermPositionVector);
					TermPositionVector tpv1 = (TermPositionVector) v1;
					TermPositionVector tpv2 = (TermPositionVector) v2;
					for (int j = 0; j < numTerms; j++)
					{
						int[] pos1 = tpv1.GetTermPositions(j);
						int[] pos2 = tpv2.GetTermPositions(j);
						Assert.AreEqual(pos1.Length, pos2.Length);

						// offsets must be present on both sides or on neither
						TermVectorOffsetInfo[] offsets1 = tpv1.GetOffsets(j);
						TermVectorOffsetInfo[] offsets2 = tpv2.GetOffsets(j);
						Assert.IsTrue((offsets1 == null) == (offsets2 == null));

						for (int k = 0; k < pos1.Length; k++)
						{
							Assert.AreEqual(pos1[k], pos2[k]);
							if (offsets1 != null)
							{
								Assert.AreEqual(offsets1[k].StartOffset, offsets2[k].StartOffset);
								Assert.AreEqual(offsets1[k].EndOffset, offsets2[k].EndOffset);
							}
						}
					}
				}
			}
		}
		/// <summary> Add a complete document specified by all its term vectors. If document has no
		/// term vectors, add value for tvx.
		/// 
		/// The current file pointers of tvd and tvf are always written to tvx. When
		/// <paramref name="vectors"/> is non-null, the field count and field numbers go to tvd,
		/// the per-field term data goes to tvf, and finally the deltas between consecutive
		/// field start pointers in tvf are written to tvd. When it is null, a field count of 0
		/// is written to tvd.
		/// </summary>
		/// <param name="vectors">term vectors of the document, one per field; may be null
		/// </param>
		/// <throws>  IOException </throws>
		/// <exception cref="System.SystemException">positions or offsets are flagged for storing
		/// (based on term 0 of the field) but are null for a later term</exception>
		public void  AddAllDocVectors(ITermFreqVector[] vectors)
		{
			
			// record where this document's data starts in tvd and tvf
			tvx.WriteLong(tvd.FilePointer);
			tvx.WriteLong(tvf.FilePointer);
			
			if (vectors != null)
			{
				int numFields = vectors.Length;
				tvd.WriteVInt(numFields);
				
				// start position of each field's data in tvf, consumed by the 2nd pass below
				var fieldPointers = new long[numFields];
				
				for (int i = 0; i < numFields; i++)
				{
					fieldPointers[i] = tvf.FilePointer;
					
					int fieldNumber = fieldInfos.FieldNumber(vectors[i].Field);
					
					// 1st pass: write field numbers to tvd
					tvd.WriteVInt(fieldNumber);
					
					int numTerms = vectors[i].Size;
					tvf.WriteVInt(numTerms);
					
					TermPositionVector tpVector;
					
					byte bits;
					bool storePositions;
					bool storeOffsets;
					
					if (vectors[i] is TermPositionVector)
					{
						// May have positions & offsets
						// Whether they are stored is decided from term 0 only.
						tpVector = (TermPositionVector) vectors[i];
						storePositions = tpVector.Size > 0 && tpVector.GetTermPositions(0) != null;
						storeOffsets = tpVector.Size > 0 && tpVector.GetOffsets(0) != null;
						bits = (byte) ((storePositions?TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR: (byte) 0) + (storeOffsets?TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR: (byte) 0));
					}
					else
					{
						tpVector = null;
						bits = 0;
						storePositions = false;
						storeOffsets = false;
					}
					
					// flags telling what follows each term: positions and/or offsets
					tvf.WriteVInt(bits);
					
					System.String[] terms = vectors[i].GetTerms();
					int[] freqs = vectors[i].GetTermFrequencies();
					
					// utf8Results[0] and [1] alternate between "current term" and "previous term";
					// resetting slot 1's length makes the first term share no prefix
					int utf8Upto = 0;
					utf8Results[1].length = 0;
					
					for (int j = 0; j < numTerms; j++)
					{
						
						UnicodeUtil.UTF16toUTF8(terms[j], 0, terms[j].Length, utf8Results[utf8Upto]);
						
						// compare the previous term's bytes with the current term's bytes; the result is
						// written below as the shared prefix length
						int start = StringHelper.BytesDifference(utf8Results[1 - utf8Upto].result, utf8Results[1 - utf8Upto].length, utf8Results[utf8Upto].result, utf8Results[utf8Upto].length);
						int length = utf8Results[utf8Upto].length - start;
						tvf.WriteVInt(start); // write shared prefix length
						tvf.WriteVInt(length); // write delta length
						tvf.WriteBytes(utf8Results[utf8Upto].result, start, length); // write delta bytes
						// swap buffers: the term just written becomes the "previous" one
						utf8Upto = 1 - utf8Upto;
						
						int termFreq = freqs[j];
						
						tvf.WriteVInt(termFreq);
						
						if (storePositions)
						{
							int[] positions = tpVector.GetTermPositions(j);
							if (positions == null)
								throw new System.SystemException("Trying to write positions that are null!");
							System.Diagnostics.Debug.Assert(positions.Length == termFreq);
							
							// use delta encoding for positions
							int lastPosition = 0;
							foreach (int position in positions)
							{
								tvf.WriteVInt(position - lastPosition);
								lastPosition = position;
							}
						}
						
						if (storeOffsets)
						{
							TermVectorOffsetInfo[] offsets = tpVector.GetOffsets(j);
							if (offsets == null)
								throw new System.SystemException("Trying to write offsets that are null!");
							System.Diagnostics.Debug.Assert(offsets.Length == termFreq);
							
							// use delta encoding for offsets
							// (start relative to the previous end, then the length of the span)
							int lastEndOffset = 0;
							foreach (TermVectorOffsetInfo t in offsets)
							{
								int startOffset = t.StartOffset;
								int endOffset = t.EndOffset;
								tvf.WriteVInt(startOffset - lastEndOffset);
								tvf.WriteVInt(endOffset - startOffset);
								lastEndOffset = endOffset;
							}
						}
					}
				}
				
				// 2nd pass: write field pointers to tvd
				// Only the deltas for fields 1..n-1 are written; the first field's pointer is
				// the tvf pointer already written to tvx above.
				if (numFields > 1)
				{
					long lastFieldPointer = fieldPointers[0];
					for (int i = 1; i < numFields; i++)
					{
						long fieldPointer = fieldPointers[i];
						tvd.WriteVLong(fieldPointer - lastFieldPointer);
						lastFieldPointer = fieldPointer;
					}
				}
			}
			else
				// no term vectors for this document: record a field count of zero
				tvd.WriteVInt(0);
		}
예제 #19
0
        /// <summary>
        /// Converts scored documents into BrowseHit objects, filling in stored fields
        /// (optional), requested term vectors, facet field values, score, comparable value and
        /// grouping information.
        /// </summary>
        /// <param name="scoreDocs">scored documents to convert; the result has the same length and order</param>
        /// <param name="sortFields">not used by this method body</param>
        /// <param name="facetHandlerMap">facet handlers whose field values are attached to every hit</param>
        /// <param name="fetchStoredFields">when true, the stored document is loaded into each hit</param>
        /// <param name="termVectorsToFetch">field names whose term vectors are attached; may be null or empty</param>
        /// <param name="groupBy">facet handler used for grouping, or null for no grouping</param>
        /// <param name="groupAccessibles">facet accessors; only element 0 is consulted for the group hit count</param>
        /// <returns>one BrowseHit per entry of <paramref name="scoreDocs"/></returns>
        protected static BrowseHit[] BuildHits(MyScoreDoc[] scoreDocs, SortField[] sortFields,
                                               IDictionary <string, IFacetHandler> facetHandlerMap, bool fetchStoredFields,
                                               IEnumerable <string> termVectorsToFetch, IFacetHandler groupBy, CombinedFacetAccessible[] groupAccessibles)
        {
            BrowseHit[] hits = new BrowseHit[scoreDocs.Length];
            IEnumerable <IFacetHandler> facetHandlers = facetHandlerMap.Values;

            // Any() stops at the first element, whereas Count() walks the whole sequence;
            // the answer does not depend on the hit, so compute it once.
            bool wantsTermVectors = termVectorsToFetch != null && termVectorsToFetch.Any();
            bool hasGroupAccessibles = groupAccessibles != null && groupAccessibles.Length > 0;

            for (int i = scoreDocs.Length - 1; i >= 0; i--)
            {
                MyScoreDoc      fdoc   = scoreDocs[i];
                BoboIndexReader reader = fdoc.reader;
                BrowseHit       hit    = new BrowseHit();
                if (fetchStoredFields)
                {
                    hit.StoredFields = reader.Document(fdoc.Doc);
                }
                if (wantsTermVectors)
                {
                    var tvMap = new Dictionary <string, BrowseHit.TermFrequencyVector>();
                    hit.TermFreqMap = tvMap;
                    foreach (string field in termVectorsToFetch)
                    {
                        // fields for which the reader returns no vector are left out of the map
                        ITermFreqVector tv = reader.GetTermFreqVector(fdoc.Doc, field);
                        if (tv != null)
                        {
                            int[]    freqs = tv.GetTermFrequencies();
                            string[] terms = tv.GetTerms();
                            tvMap[field] = new BrowseHit.TermFrequencyVector(terms, freqs);
                        }
                    }
                }
                var map    = new Dictionary <string, string[]>();
                var rawMap = new Dictionary <string, object[]>();
                foreach (var facetHandler in facetHandlers)
                {
                    map[facetHandler.Name]    = facetHandler.GetFieldValues(reader, fdoc.Doc);
                    rawMap[facetHandler.Name] = facetHandler.GetRawFieldValues(reader, fdoc.Doc);
                }
                hit.FieldValues    = map;
                hit.RawFieldValues = rawMap;
                // the reported id is the document id plus the base of the queue it came from
                hit.DocId          = fdoc.Doc + fdoc.queue.@base;
                hit.Score          = fdoc.Score;
                hit.Comparable     = fdoc.Value;
                if (groupBy != null)
                {
                    hit.GroupField    = groupBy.Name;
                    hit.GroupValue    = hit.GetField(groupBy.Name);
                    hit.RawGroupValue = hit.GetRawField(groupBy.Name);
                    if (hasGroupAccessibles && hit.GroupValue != null)
                    {
                        BrowseFacet facet = groupAccessibles[0].GetFacet(hit.GroupValue);
                        // NOTE(review): assumes GetFacet can return null for an unknown value - confirm;
                        // in that case the hit count is left at its default.
                        if (facet != null)
                        {
                            hit.GroupHitsCount = facet.FacetValueHitCount;
                        }
                    }
                }
                hits[i] = hit;
            }
            return hits;
        }
예제 #20
0
        /// <summary>
        /// Searches for the term "zero" in "field" (exactly one hit expected) and checks the
        /// hit's single term vector: documents with an id divisible by 2 must carry positions,
        /// those divisible by 3 must carry offsets, and all others must not be castable to
        /// TermPositionVector. An IOException fails the test.
        /// </summary>
        public virtual void  TestTermPositionVectors()
        {
            Query zeroQuery = new TermQuery(new Term("field", "zero"));

            try
            {
                ScoreDoc[] matches = searcher.Search(zeroQuery, null, 1000).ScoreDocs;
                Assert.AreEqual(1, matches.Length);

                foreach (ScoreDoc match in matches)
                {
                    ITermFreqVector[] docVectors = searcher.reader_ForNUnit.GetTermFreqVectors(match.Doc);
                    Assert.IsTrue(docVectors != null);
                    Assert.IsTrue(docVectors.Length == 1);

                    bool expectPositions = match.Doc % 2 == 0;
                    Assert.IsTrue(!expectPositions || docVectors[0] is TermPositionVector);

                    bool expectOffsets = match.Doc % 3 == 0;
                    Assert.IsTrue(!expectOffsets || docVectors[0] is TermPositionVector);

                    if (!expectPositions && !expectOffsets)
                    {
                        // plain frequency vector expected: the cast has to fail
                        try
                        {
                            TermPositionVector unexpected = (TermPositionVector)docVectors[0];
                            Assert.IsTrue(false);
                        }
                        catch (System.InvalidCastException)
                        {
                            ITermFreqVector plain      = docVectors[0];
                            System.String[] plainTerms = plain.GetTerms();
                            Assert.IsTrue(plainTerms != null && plainTerms.Length > 0);
                        }
                        continue;
                    }

                    TermPositionVector positional = (TermPositionVector)docVectors[0];
                    System.String[]    terms      = positional.GetTerms();
                    Assert.IsTrue(terms != null && terms.Length > 0);

                    for (int t = 0; t < terms.Length; t++)
                    {
                        int[] termPositions = positional.GetTermPositions(t);
                        TermVectorOffsetInfo[] termOffsets = positional.GetOffsets(t);

                        if (expectPositions)
                        {
                            Assert.IsTrue(termPositions != null);
                            Assert.IsTrue(termPositions.Length > 0);
                        }
                        else
                        {
                            Assert.IsTrue(termPositions == null);
                        }

                        if (expectOffsets)
                        {
                            Assert.IsTrue(termOffsets != null);
                            Assert.IsTrue(termOffsets.Length > 0);
                        }
                        else
                        {
                            Assert.IsTrue(termOffsets == null);
                        }
                    }
                }
            }
            catch (System.IO.IOException)
            {
                Assert.IsTrue(false);
            }
        }
		/// <summary>
		/// Reads the term vectors of every document through <c>reader</c>, first all
		/// fields at once and then the single field "field", and verifies each result
		/// with <c>VerifyVectors</c>. Only the time spent inside the reader calls is
		/// added, in milliseconds, to <c>timeElapsed</c>.
		/// </summary>
		private void  TestTermVectors()
		{
			int numDocs = reader.NumDocs();
			// A Stopwatch measures elapsed time and, unlike subtracting DateTime.Now
			// readings, is not affected by wall-clock or DST adjustments.
			System.Diagnostics.Stopwatch timer = new System.Diagnostics.Stopwatch();
			for (int docId = 0; docId < numDocs; docId++)
			{
				timer.Reset();
				timer.Start();
				ITermFreqVector[] vectors = reader.GetTermFreqVectors(docId);
				timer.Stop();
				timeElapsed += timer.ElapsedMilliseconds;
				
				// verify vectors result
				VerifyVectors(vectors, docId);
				
				timer.Reset();
				timer.Start();
				ITermFreqVector vector = reader.GetTermFreqVector(docId, "field");
				timer.Stop();
				timeElapsed += timer.ElapsedMilliseconds;
				
				// Wrap the single-field vector so the same verifier can be reused.
				vectors = new ITermFreqVector[1];
				vectors[0] = vector;
				
				VerifyVectors(vectors, docId);
			}
		}
예제 #22
0
        /// <summary>
        /// Indexes four small, known documents into a <c>MockRAMDirectory</c> and checks that:
        /// the per-document term vector frequencies agree with the frequencies reported
        /// by <c>TermDocs</c>; a search for "chocolate" returns the documents in the expected
        /// order; the term vector of the fourth document matches the hand-counted
        /// frequencies in <c>test4Map</c>; and the sorted term vector mappers return the
        /// expected number of entries, ordered by descending frequency.
        /// </summary>
        public virtual void  TestKnownSetOfDocuments()
        {
            System.String test1 = "eating chocolate in a computer lab";                                             //6 terms
            System.String test2 = "computer in a computer lab";                                                     //5 terms
            System.String test3 = "a chocolate lab grows old";                                                      //5 terms
            System.String test4 = "eating chocolate with a chocolate lab in an old chocolate colored computer lab"; //13 terms

            // Hand-counted term frequencies of test4 (10 distinct terms).
            System.Collections.IDictionary test4Map = new System.Collections.Hashtable();
            test4Map["chocolate"] = 3;
            test4Map["lab"]       = 2;
            test4Map["eating"]    = 1;
            test4Map["computer"]  = 1;
            test4Map["with"]      = 1;
            test4Map["a"]         = 1;
            test4Map["colored"]   = 1;
            test4Map["in"]        = 1;
            test4Map["an"]        = 1;
            test4Map["old"]       = 1;

            Document testDoc1 = new Document();
            SetupDoc(testDoc1, test1);

            Document testDoc2 = new Document();
            SetupDoc(testDoc2, test2);

            Document testDoc3 = new Document();
            SetupDoc(testDoc3, test3);

            Document testDoc4 = new Document();
            SetupDoc(testDoc4, test4);

            Directory dir = new MockRAMDirectory();

            try
            {
                IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
                writer.AddDocument(testDoc1);
                writer.AddDocument(testDoc2);
                writer.AddDocument(testDoc3);
                writer.AddDocument(testDoc4);
                writer.Close();

                IndexSearcher knownSearcher = new IndexSearcher(dir, true);
                try
                {
                    TermEnum termEnum = knownSearcher.reader_ForNUnit.Terms();
                    TermDocs termDocs = knownSearcher.reader_ForNUnit.TermDocs();

                    Similarity sim = knownSearcher.Similarity;
                    while (termEnum.Next())
                    {
                        Term term = termEnum.Term;
                        termDocs.Seek(term);
                        while (termDocs.Next())
                        {
                            int docId = termDocs.Doc;
                            int freq  = termDocs.Freq;

                            ITermFreqVector vector = knownSearcher.reader_ForNUnit.GetTermFreqVector(docId, "field");
                            // Check for null before the vector is dereferenced below.
                            Assert.IsTrue(vector != null);

                            System.String[] vTerms = vector.GetTerms();
                            int[]           freqs  = vector.GetTermFrequencies();

                            // The similarity values are computed but not asserted on.
                            float tf  = sim.Tf(freq);
                            float idf = sim.Idf(knownSearcher.DocFreq(term), knownSearcher.MaxDoc);
                            //This is fine since we don't have stop words
                            float lNorm = sim.LengthNorm("field", vTerms.Length);

                            for (int i = 0; i < vTerms.Length; i++)
                            {
                                if (term.Text.Equals(vTerms[i]))
                                {
                                    Assert.IsTrue(freqs[i] == freq);
                                }
                            }
                        }
                    }

                    Query      query = new TermQuery(new Term("field", "chocolate"));
                    ScoreDoc[] hits  = knownSearcher.Search(query, null, 1000).ScoreDocs;
                    // testDoc3 (doc id 2) should be the first hit b/c it is the shortest match
                    Assert.IsTrue(hits.Length == 3);
                    Assert.IsTrue(hits[0].Doc == 2);
                    Assert.IsTrue(hits[1].Doc == 3);
                    Assert.IsTrue(hits[2].Doc == 0);

                    // hits[1] is doc id 3, i.e. testDoc4, whose text is test4.
                    ITermFreqVector vector2 = knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].Doc, "field");
                    Assert.IsTrue(vector2 != null);
                    System.String[] terms  = vector2.GetTerms();
                    int[]           freqs2 = vector2.GetTermFrequencies();
                    Assert.IsTrue(terms != null && terms.Length == 10);
                    for (int i = 0; i < terms.Length; i++)
                    {
                        System.String term = terms[i];
                        int freq = freqs2[i];
                        Assert.IsTrue(test4.IndexOf(term) != -1);
                        // Explicit membership check instead of catching the exception thrown
                        // when a missing (null) entry is unboxed.
                        Assert.IsTrue(test4Map.Contains(term), "Unexpected term in vector: " + term);
                        int expectedTermFreq = (int)test4Map[term];
                        Assert.IsTrue(expectedTermFreq == freq);
                    }

                    SortedTermVectorMapper mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
                    knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].Doc, mapper);
                    var vectorEntrySet = mapper.TermVectorEntrySet;
                    Assert.IsTrue(vectorEntrySet.Count == 10, "mapper.getTermVectorEntrySet() Size: " + vectorEntrySet.Count + " is not: " + 10);
                    TermVectorEntry last = null;
                    foreach (TermVectorEntry tve in vectorEntrySet)
                    {
                        if (tve != null)
                        {
                            // The ordering can only be checked once a previous entry exists.
                            if (last != null)
                            {
                                Assert.IsTrue(last.Frequency >= tve.Frequency, "terms are not properly sorted");
                            }
                            // The frequency is checked for every entry, including the first one.
                            System.Int32 expectedFreq = (System.Int32)test4Map[tve.Term];
                            //we expect double the expectedFreq, since there are two fields with the exact same text and we are collapsing all fields
                            Assert.IsTrue(tve.Frequency == 2 * expectedFreq, "Frequency is not correct:");
                        }
                        last = tve;
                    }

                    FieldSortedTermVectorMapper fieldMapper = new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
                    knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].Doc, fieldMapper);
                    var map = fieldMapper.FieldToTerms;
                    Assert.IsTrue(map.Count == 2, "map Size: " + map.Count + " is not: " + 2);
                    vectorEntrySet = map["field"];
                    Assert.IsTrue(vectorEntrySet != null, "vectorEntrySet is null and it shouldn't be");
                    Assert.IsTrue(vectorEntrySet.Count == 10, "vectorEntrySet Size: " + vectorEntrySet.Count + " is not: " + 10);
                }
                finally
                {
                    // Close the searcher even when one of the assertions above fails.
                    knownSearcher.Close();
                }
            }
            catch (System.IO.IOException e)
            {
                System.Console.Error.WriteLine(e.StackTrace);
                Assert.Fail("Unexpected IOException: " + e);
            }
        }