コード例 #1
0
ファイル: TermVector.cs プロジェクト: mammo/LukeSharp
        /// <summary>
        /// Builds the term vector dialog for one field: shows the field name in the
        /// label and fills the list view with one row (frequency, term) per term of
        /// <paramref name="tfv"/>, ordered by term text.
        /// </summary>
        /// <param name="fieldName">Name of the field, displayed in the field label.</param>
        /// <param name="tfv">Term frequency vector supplying the terms and their frequencies.</param>
        public TermVector(string fieldName, TermFreqVector tfv)
        {
            //
            // Required for Windows Form Designer support
            //
            InitializeComponent();

            lblField.Text = fieldName;

            List<TermFrequency> tvs = new List<TermFrequency>(tfv.Size());

            String[] terms = tfv.GetTerms();
            int[] freqs = tfv.GetTermFrequencies();
            for (int i = 0; i < terms.Length; i++)
            {
                tvs.Add(new TermFrequency(terms[i], freqs[i]));
            }

            // OrderBy does not sort the list in place; it returns a new ordered
            // sequence, so the rows have to be read from its result.
            IEnumerable<TermFrequency> orderedByTerm = tvs.OrderBy(p => p.Term);

            // Suspend redrawing while the rows are added.
            listViewTVF.BeginUpdate();

            foreach (TermFrequency tf in orderedByTerm)
            {
                ListViewItem item = new ListViewItem(new string[] { tf.Frequency.ToString(), tf.Term });
                listViewTVF.Items.Add(item);
            }

            listViewTVF.EndUpdate();
        }
コード例 #2
0
ファイル: ParallelReader.cs プロジェクト: vernon016/mono
 /// <summary>
 /// Collects the term frequency vectors of document <paramref name="n"/> by asking,
 /// for every entry of <c>fieldToReader</c>, the entry's reader for the entry's field.
 /// </summary>
 /// <param name="n">Document number passed on to each reader.</param>
 /// <returns>The non-null vectors that were returned; fields without a vector are skipped.</returns>
 public override TermFreqVector[] GetTermFreqVectors(int n)
 {
     EnsureOpen();

     System.Collections.ArrayList found = new System.Collections.ArrayList();

     // The entries are enumerated from a Hashtable copy of the map, not from the map itself.
     System.Collections.Hashtable snapshot = new System.Collections.Hashtable(fieldToReader);
     foreach (System.Collections.DictionaryEntry entry in snapshot)
     {
         System.String fieldName   = (System.String)entry.Key;
         IndexReader   fieldReader = (IndexReader)entry.Value;

         TermFreqVector fieldVector = fieldReader.GetTermFreqVector(n, fieldName);
         if (fieldVector == null)
         {
             continue;
         }
         found.Add(fieldVector);
     }

     return (TermFreqVector[])found.ToArray(typeof(TermFreqVector));
 }
コード例 #3
0
        /// <summary>
        /// Collects the term frequency vectors of document <paramref name="n"/> by asking,
        /// for every entry of <c>fieldToReader</c>, the entry's reader for the entry's field.
        /// </summary>
        /// <param name="n">Document number passed on to each reader.</param>
        /// <returns>The non-null vectors that were returned; fields without a vector are skipped.</returns>
        public override TermFreqVector[] GetTermFreqVectors(int n)
        {
            EnsureOpen();

            List<TermFreqVector> found = new List<TermFreqVector>();
            foreach (KeyValuePair<string, IndexReader> entry in fieldToReader)
            {
                TermFreqVector fieldVector = entry.Value.GetTermFreqVector(n, entry.Key);
                if (fieldVector == null)
                {
                    continue;
                }
                found.Add(fieldVector);
            }

            return found.ToArray();
        }
コード例 #4
0
        /// <summary>
        /// Reads the segment infos, opens a reader and checks that documents 0 and 1 can
        /// be retrieved with the expected number of fields, that document 0 has a term
        /// vector for <c>DocHelper.TEXT_FIELD_2_KEY</c>, and that the norms check passes.
        /// </summary>
        public virtual void  DoTestDocument()
        {
            sis.Read(dir);

            IndexReader indexReader = OpenReader();
            Assert.IsTrue(indexReader != null);

            // Expected field count: the source document's fields minus DocHelper.unstored.Count.
            Document firstDoc = indexReader.Document(0);
            Assert.IsTrue(firstDoc != null);
            Assert.IsTrue(DocHelper.NumFields(firstDoc) == DocHelper.NumFields(doc1) - DocHelper.unstored.Count);

            Document secondDoc = indexReader.Document(1);
            Assert.IsTrue(secondDoc != null);
            Assert.IsTrue(DocHelper.NumFields(secondDoc) == DocHelper.NumFields(doc2) - DocHelper.unstored.Count);

            TermFreqVector textVector = indexReader.GetTermFreqVector(0, DocHelper.TEXT_FIELD_2_KEY);
            Assert.IsTrue(textVector != null);

            TestSegmentReader.CheckNorms(indexReader);
        }
コード例 #5
0
 /// <summary>
 /// Collects the term frequency vectors of document <paramref name="n"/> by asking,
 /// for every entry of <c>fieldToReader</c>, the entry's reader for the entry's field.
 /// </summary>
 /// <param name="n">Document number passed on to each reader.</param>
 /// <returns>The non-null vectors that were returned; fields without a vector are skipped.</returns>
 public override TermFreqVector[] GetTermFreqVectors(int n)
 {
     System.Collections.ArrayList found = new System.Collections.ArrayList();

     // The entries are enumerated from a Hashtable copy of the map, not from the map itself.
     System.Collections.Hashtable snapshot = new System.Collections.Hashtable(fieldToReader);
     foreach (System.Collections.DictionaryEntry entry in snapshot)
     {
         // NOTE(review): the key is read as the field name and the value as its reader.
         // An earlier port questioned whether the orientation is the other way round;
         // confirm against the declaration of fieldToReader.
         System.String fieldName   = (System.String)entry.Key;
         IndexReader   fieldReader = (IndexReader)entry.Value;

         TermFreqVector fieldVector = fieldReader.GetTermFreqVector(n, fieldName);
         if (fieldVector == null)
         {
             continue;
         }
         found.Add(fieldVector);
     }

     return (TermFreqVector[])found.ToArray(typeof(TermFreqVector));
 }
コード例 #6
0
        /// <summary>
        /// Opens the term vector dialog for the field selected in the document field list.
        /// </summary>
        /// <remarks>
        /// Does nothing when no field is selected. Shows a status message instead of the
        /// dialog when no index reader is available, when the document number text is not
        /// a valid integer, when the field has no term vector, or when an exception is
        /// thrown while the vector is loaded or displayed.
        /// </remarks>
        internal void ShowTV()
        {
            if (listDocFields.SelectedItems.Count == 0)
            {
                return;
            }
            if (_luke.IndexReader == null)
            {
                _luke.ShowStatus(_luke.resources.GetString("NoIndex"));
                return;
            }

            // TryParse reports malformed input through its return value, so an expected
            // failure no longer has to be thrown and caught as an exception.
            int docId;
            if (!Int32.TryParse(textDocNum.Text, out docId))
            {
                _luke.ShowStatus(_luke.resources.GetString("DocNotSelected"));
                return;
            }

            try
            {
                string fieldName = listDocFields.SelectedItems[0].SubItems[0].Text;
                // Strip the first and last character of the displayed text.
                // NOTE(review): presumably the list shows the name wrapped in delimiters - confirm.
                fieldName = fieldName.Substring(1, fieldName.Length - 2);
                TermFreqVector tfv = _luke.IndexReader.GetTermFreqVector(docId, fieldName);
                if (tfv == null)
                {
                    _luke.ShowStatus(_luke.resources.GetString("NoTV"));
                    return;
                }

                // A form shown with ShowDialog is not disposed when it is closed,
                // so it is disposed here once the dialog returns.
                using (TermVector tvDialog = new TermVector(fieldName, tfv))
                {
                    tvDialog.ShowDialog(this);
                }
            }
            catch (Exception exc)
            {
                _luke.ShowStatus(exc.Message);
            }
        }
コード例 #7
0
        /// <summary>
        /// Checks that document 0 has a term vector for <c>DocHelper.TEXT_FIELD_2_KEY</c>
        /// with three terms, each occurring in <c>DocHelper.FIELD_2_TEXT</c> with a positive
        /// frequency, and that the document has four term vectors in total.
        /// </summary>
        public virtual void  TestTermVectors()
        {
            TermFreqVector fieldVector = reader.GetTermFreqVector(0, DocHelper.TEXT_FIELD_2_KEY);
            Assert.IsTrue(fieldVector != null);

            System.String[] vectorTerms = fieldVector.GetTerms();
            int[]           vectorFreqs = fieldVector.GetTermFrequencies();
            Assert.IsTrue(vectorTerms != null);
            Assert.IsTrue(vectorTerms.Length == 3);
            Assert.IsTrue(vectorFreqs != null);
            Assert.IsTrue(vectorFreqs.Length == 3);

            for (int idx = 0; idx < vectorTerms.Length; idx++)
            {
                // Every term must occur somewhere in the field's source text.
                Assert.IsTrue(DocHelper.FIELD_2_TEXT.IndexOf(vectorTerms[idx]) != -1);
                Assert.IsTrue(vectorFreqs[idx] > 0);
            }

            TermFreqVector[] allVectors = reader.GetTermFreqVectors(0);
            Assert.IsTrue(allVectors != null);
            Assert.IsTrue(allVectors.Length == 4, "We do not have 4 term freq vectors, we have: " + allVectors.Length);
        }
コード例 #8
0
        /// <summary>
        /// Opens a TermVectorsReader and checks, for documents 0 through 4, that the
        /// vector of the first test field contains exactly the terms of <c>testTerms</c>,
        /// in the same order.
        /// </summary>
        public virtual void  TestReader()
        {
            TermVectorsReader vectorsReader = new TermVectorsReader(dir, seg, fieldInfos);
            Assert.IsTrue(vectorsReader != null);

            for (int docNum = 0; docNum < 5; docNum++)
            {
                TermFreqVector docVector = vectorsReader.Get(docNum, testFields[0]);
                Assert.IsTrue(docVector != null);

                System.String[] actualTerms = docVector.GetTerms();
                Assert.IsTrue(actualTerms != null);
                Assert.IsTrue(actualTerms.Length == testTerms.Length);

                for (int t = 0; t < actualTerms.Length; t++)
                {
                    Assert.IsTrue(actualTerms[t].Equals(testTerms[t]));
                }
            }
        }
コード例 #9
0
 /// <summary>
 /// Checks how TermVectorsReader.Get reacts to bad arguments: document number 50 must
 /// raise an IOException (with and without a field name), while an unknown field on
 /// document 0 must return null without throwing.
 /// </summary>
 public virtual void  TestBadParams()
 {
     try
     {
         TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
         Assert.IsTrue(reader != null);
         //Bad document number, good field number
         reader.Get(50, testFields[0]);
         Assert.Fail();
     }
     catch (System.IO.IOException)
     {
         // expected exception
     }
     try
     {
         TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
         Assert.IsTrue(reader != null);
         //Bad document number, no field
         reader.Get(50);
         Assert.Fail();
     }
     catch (System.IO.IOException)
     {
         // expected exception
     }
     try
     {
         TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
         Assert.IsTrue(reader != null);
         //good document number, bad field number
         TermFreqVector vector = reader.Get(0, "f50");
         Assert.IsTrue(vector == null);
     }
     catch (System.IO.IOException e)
     {
         // Report why the lookup failed instead of failing without any detail.
         Assert.Fail("Unexpected IOException: " + e.Message);
     }
 }
コード例 #10
0
 /// <summary>
 /// Reads the segment infos, opens a MultiReader over <c>readers</c> and checks that
 /// documents 0 and 1 can be retrieved with the expected number of fields and that
 /// document 0 has a term vector for <c>DocHelper.TEXT_FIELD_2_KEY</c>.
 /// An IOException is printed to standard error and fails the test.
 /// </summary>
 public virtual void  TestDocument()
 {
     try
     {
         sis.Read(dir);

         MultiReader multiReader = new MultiReader(dir, sis, false, readers);
         Assert.IsTrue(multiReader != null);

         // Each retrieved document is expected to have two fields fewer than its source document.
         Document firstDoc = multiReader.Document(0);
         Assert.IsTrue(firstDoc != null);
         Assert.IsTrue(DocHelper.NumFields(firstDoc) == DocHelper.NumFields(doc1) - 2);

         Document secondDoc = multiReader.Document(1);
         Assert.IsTrue(secondDoc != null);
         Assert.IsTrue(DocHelper.NumFields(secondDoc) == DocHelper.NumFields(doc2) - 2);

         TermFreqVector textVector = multiReader.GetTermFreqVector(0, DocHelper.TEXT_FIELD_2_KEY);
         Assert.IsTrue(textVector != null);
     }
     catch (System.IO.IOException ioError)
     {
         System.Console.Error.WriteLine(ioError.StackTrace);
         Assert.IsTrue(false);
     }
 }
コード例 #11
0
ファイル: TestTermVectorsReader.cs プロジェクト: yonder/mono
 /// <summary>
 /// Opens a TermVectorsReader and checks that the vector of the first test field in
 /// document 0 contains exactly the terms of <c>testTerms</c>, in the same order.
 /// An IOException is printed to standard error and fails the test.
 /// </summary>
 public virtual void  TestReader()
 {
     try
     {
         TermVectorsReader vectorsReader = new TermVectorsReader(dir, seg, fieldInfos);
         Assert.IsTrue(vectorsReader != null);

         TermFreqVector docVector = vectorsReader.Get(0, testFields[0]);
         Assert.IsTrue(docVector != null);

         System.String[] actualTerms = docVector.GetTerms();
         Assert.IsTrue(actualTerms != null);
         Assert.IsTrue(actualTerms.Length == testTerms.Length);

         for (int t = 0; t < actualTerms.Length; t++)
         {
             Assert.IsTrue(actualTerms[t].Equals(testTerms[t]));
         }
     }
     catch (System.IO.IOException ioError)
     {
         System.Console.Error.WriteLine(ioError.StackTrace);
         Assert.IsTrue(false);
     }
 }
コード例 #12
0
		/// <summary>
		/// Asserts that two arrays of term vectors are equal: same length and, per
		/// index, the same size, terms and frequencies. When the first vector of a
		/// pair is a TermPositionVector, the second must be one too and their
		/// positions and offsets are compared as well.
		/// </summary>
		/// <param name="d1">First array of vectors; may be null, in which case <paramref name="d2"/> must be null too.</param>
		/// <param name="d2">Second array of vectors.</param>
		public static void  VerifyEquals(TermFreqVector[] d1, TermFreqVector[] d2)
		{
			if (d1 == null)
			{
				Assert.IsTrue(d2 == null);
				return ;
			}
			Assert.IsTrue(d2 != null);
			
			Assert.AreEqual(d1.Length, d2.Length);
			for (int i = 0; i < d1.Length; i++)
			{
				TermFreqVector v1 = d1[i];
				TermFreqVector v2 = d2[i];
				if (v1 == null || v2 == null)
				{
					// Fail with the diagnostic instead of printing it and then hitting a
					// NullReferenceException on the dereference below.
					Assert.Fail("v1=" + v1 + " v2=" + v2 + " i=" + i + " of " + d1.Length);
				}
				Assert.AreEqual(v1.Size(), v2.Size());
				int numTerms = v1.Size();
				System.String[] terms1 = v1.GetTerms();
				System.String[] terms2 = v2.GetTerms();
				int[] freq1 = v1.GetTermFrequencies();
				int[] freq2 = v2.GetTermFrequencies();
				for (int j = 0; j < numTerms; j++)
				{
					Assert.AreEqual(terms1[j], terms2[j]);
					Assert.AreEqual(freq1[j], freq2[j]);
				}
				if (v1 is TermPositionVector)
				{
					Assert.IsTrue(v2 is TermPositionVector);
					TermPositionVector tpv1 = (TermPositionVector) v1;
					TermPositionVector tpv2 = (TermPositionVector) v2;
					for (int j = 0; j < numTerms; j++)
					{
						int[] pos1 = tpv1.GetTermPositions(j);
						int[] pos2 = tpv2.GetTermPositions(j);
						Assert.AreEqual(pos1.Length, pos2.Length);
						TermVectorOffsetInfo[] offsets1 = tpv1.GetOffsets(j);
						TermVectorOffsetInfo[] offsets2 = tpv2.GetOffsets(j);
						// Offsets must be present on both sides or absent on both sides.
						if (offsets1 == null)
						{
							Assert.IsTrue(offsets2 == null);
						}
						else
						{
							Assert.IsTrue(offsets2 != null);
						}
						for (int k = 0; k < pos1.Length; k++)
						{
							Assert.AreEqual(pos1[k], pos2[k]);
							if (offsets1 != null)
							{
								Assert.AreEqual(offsets1[k].GetStartOffset(), offsets2[k].GetStartOffset());
								Assert.AreEqual(offsets1[k].GetEndOffset(), offsets2[k].GetEndOffset());
							}
						}
					}
				}
			}
		}
コード例 #13
0
        /// <summary>
        /// Indexes four small documents with known contents and checks the term vectors
        /// read back from the index: per-term frequencies against the postings, the hit
        /// order for the query "chocolate", the frequencies of the second hit against the
        /// expected values for <c>test4</c>, and the output of the sorted term vector mappers.
        /// An IOException is printed to standard error and fails the test.
        /// </summary>
        public virtual void  TestKnownSetOfDocuments()
        {
            System.String test1 = "eating chocolate in a computer lab";                                             //6 terms
            System.String test2 = "computer in a computer lab";                                                     //5 terms
            System.String test3 = "a chocolate lab grows old";                                                      //5 terms
            System.String test4 = "eating chocolate with a chocolate lab in an old chocolate colored computer lab"; //13 terms

            // Expected frequency of each of the 10 distinct terms of test4.
            System.Collections.IDictionary test4Map = new System.Collections.Hashtable();
            test4Map["chocolate"] = 3;
            test4Map["lab"]       = 2;
            test4Map["eating"]    = 1;
            test4Map["computer"]  = 1;
            test4Map["with"]      = 1;
            test4Map["a"]         = 1;
            test4Map["colored"]   = 1;
            test4Map["in"]        = 1;
            test4Map["an"]        = 1;
            test4Map["old"]       = 1;

            Document testDoc1 = new Document();

            SetupDoc(testDoc1, test1);
            Document testDoc2 = new Document();

            SetupDoc(testDoc2, test2);
            Document testDoc3 = new Document();

            SetupDoc(testDoc3, test3);
            Document testDoc4 = new Document();

            SetupDoc(testDoc4, test4);

            Directory dir = new MockRAMDirectory();

            try
            {
                IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
                Assert.IsTrue(writer != null);
                writer.AddDocument(testDoc1);
                writer.AddDocument(testDoc2);
                writer.AddDocument(testDoc3);
                writer.AddDocument(testDoc4);
                writer.Close();
                IndexSearcher knownSearcher = new IndexSearcher(dir);
                TermEnum      termEnum      = knownSearcher.reader_ForNUnit.Terms();
                TermDocs      termDocs      = knownSearcher.reader_ForNUnit.TermDocs();

                Similarity sim = knownSearcher.GetSimilarity();
                while (termEnum.Next() == true)
                {
                    Term term = termEnum.Term();
                    termDocs.Seek(term);
                    while (termDocs.Next())
                    {
                        int docId = termDocs.Doc();
                        int freq  = termDocs.Freq();
                        TermFreqVector vector = knownSearcher.reader_ForNUnit.GetTermFreqVector(docId, "field");
                        // Assert before the vector is dereferenced, so a missing vector is
                        // reported as an assertion failure rather than a NullReferenceException.
                        Assert.IsTrue(vector != null);

                        // The three results below are not inspected; the calls are kept so
                        // the Similarity methods still run against the indexed data.
                        float tf  = sim.Tf(freq);
                        float idf = sim.Idf(term, knownSearcher);
                        //This is fine since we don't have stop words
                        float lNorm = sim.LengthNorm("field", vector.GetTerms().Length);

                        // The frequency stored in the vector must match the postings frequency.
                        System.String[] vTerms = vector.GetTerms();
                        int[]           freqs  = vector.GetTermFrequencies();
                        for (int i = 0; i < vTerms.Length; i++)
                        {
                            if (term.Text().Equals(vTerms[i]))
                            {
                                Assert.IsTrue(freqs[i] == freq);
                            }
                        }
                    }
                }
                Query      query = new TermQuery(new Term("field", "chocolate"));
                ScoreDoc[] hits  = knownSearcher.Search(query, null, 1000).scoreDocs;
                //doc 3 should be the first hit b/c it is the shortest match
                Assert.IsTrue(hits.Length == 3);

                Assert.IsTrue(hits[0].doc == 2);
                Assert.IsTrue(hits[1].doc == 3);
                Assert.IsTrue(hits[2].doc == 0);
                TermFreqVector vector2 = knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].doc, "field");
                Assert.IsTrue(vector2 != null);
                System.String[] terms  = vector2.GetTerms();
                int[]           freqs2 = vector2.GetTermFrequencies();
                Assert.IsTrue(terms != null && terms.Length == 10);
                for (int i = 0; i < terms.Length; i++)
                {
                    System.String term = terms[i];
                    int freq = freqs2[i];
                    Assert.IsTrue(test4.IndexOf(term) != -1);

                    // Check that the term is expected before unboxing, instead of
                    // catching the exception an unboxed null would raise.
                    System.Object expectedFreq = test4Map[term];
                    Assert.IsTrue(expectedFreq != null);
                    Assert.IsTrue((System.Int32)expectedFreq == freq);
                }
                SortedTermVectorMapper mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
                knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].doc, mapper);
                System.Collections.Generic.SortedDictionary <object, object> vectorEntrySet = mapper.GetTermVectorEntrySet();
                Assert.IsTrue(vectorEntrySet.Count == 10, "mapper.getTermVectorEntrySet() Size: " + vectorEntrySet.Count + " is not: " + 10);
                TermVectorEntry last = null;
                foreach (TermVectorEntry tve in vectorEntrySet.Keys)
                {
                    if (tve != null && last != null)
                    {
                        Assert.IsTrue(last.GetFrequency() >= tve.GetFrequency(), "terms are not properly sorted");
                        System.Int32 expectedFreq = (System.Int32)test4Map[tve.GetTerm()];
                        //we expect double the expectedFreq, since there are two fields with the exact same text and we are collapsing all fields
                        Assert.IsTrue(tve.GetFrequency() == 2 * expectedFreq, "Frequency is not correct:");
                    }
                    last = tve;
                }

                FieldSortedTermVectorMapper fieldMapper = new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
                knownSearcher.reader_ForNUnit.GetTermFreqVector(hits[1].doc, fieldMapper);
                System.Collections.IDictionary map = fieldMapper.GetFieldToTerms();
                Assert.IsTrue(map.Count == 2, "map Size: " + map.Count + " is not: " + 2);
                vectorEntrySet = (System.Collections.Generic.SortedDictionary <Object, Object>)map["field"];
                Assert.IsTrue(vectorEntrySet != null, "vectorEntrySet is null and it shouldn't be");
                Assert.IsTrue(vectorEntrySet.Count == 10, "vectorEntrySet Size: " + vectorEntrySet.Count + " is not: " + 10);
                knownSearcher.Close();
            }
            catch (System.IO.IOException e)
            {
                System.Console.Error.WriteLine(e.StackTrace);
                Assert.IsTrue(false);
            }
        }
コード例 #14
0
        /// <summary>
        /// Searches "field" for the term "zero" and checks the single term vector of every
        /// hit: documents whose number is divisible by 2 are expected to expose positions,
        /// documents whose number is divisible by 3 are expected to expose offsets, and all
        /// other documents are expected to have a vector that is not a TermPositionVector.
        /// An IOException fails the test.
        /// </summary>
        public virtual void  TestTermPositionVectors()
        {
            Query zeroQuery = new TermQuery(new Term("field", "zero"));

            try
            {
                ScoreDoc[] hits = searcher.Search(zeroQuery, null, 1000).scoreDocs;
                Assert.AreEqual(1, hits.Length);

                foreach (ScoreDoc hit in hits)
                {
                    TermFreqVector[] docVectors = searcher.reader_ForNUnit.GetTermFreqVectors(hit.doc);
                    Assert.IsTrue(docVectors != null);
                    Assert.IsTrue(docVectors.Length == 1);

                    bool expectPositions = hit.doc % 2 == 0;
                    Assert.IsTrue(!expectPositions || docVectors[0] is TermPositionVector);

                    bool expectOffsets = hit.doc % 3 == 0;
                    Assert.IsTrue(!expectOffsets || docVectors[0] is TermPositionVector);

                    if (!expectPositions && !expectOffsets)
                    {
                        // Neither positions nor offsets expected: the cast has to fail.
                        try
                        {
                            TermPositionVector unexpected = (TermPositionVector)docVectors[0];
                            Assert.IsTrue(false);
                        }
                        catch (System.InvalidCastException)
                        {
                            TermFreqVector  plainVector = docVectors[0];
                            System.String[] plainTerms  = plainVector.GetTerms();
                            Assert.IsTrue(plainTerms != null && plainTerms.Length > 0);
                        }
                        continue;
                    }

                    TermPositionVector positionVector = (TermPositionVector)docVectors[0];
                    System.String[]    vectorTerms    = positionVector.GetTerms();
                    Assert.IsTrue(vectorTerms != null && vectorTerms.Length > 0);

                    for (int termIndex = 0; termIndex < vectorTerms.Length; termIndex++)
                    {
                        int[] positions = positionVector.GetTermPositions(termIndex);
                        TermVectorOffsetInfo[] offsets = positionVector.GetOffsets(termIndex);

                        // Positions must be present and non-empty exactly when they are expected.
                        if (!expectPositions)
                        {
                            Assert.IsTrue(positions == null);
                        }
                        else
                        {
                            Assert.IsTrue(positions != null);
                            Assert.IsTrue(positions.Length > 0);
                        }

                        // The same rule applies to offsets.
                        if (!expectOffsets)
                        {
                            Assert.IsTrue(offsets == null);
                        }
                        else
                        {
                            Assert.IsTrue(offsets != null);
                            Assert.IsTrue(offsets.Length > 0);
                        }
                    }
                }
            }
            catch (System.IO.IOException)
            {
                Assert.IsTrue(false);
            }
        }
コード例 #15
0
        /// <summary>
        /// Asserts that two arrays of term vectors are equal: same length and, per
        /// index, the same size, terms and frequencies. When the first vector of a
        /// pair is a TermPositionVector, the second must be one too and their
        /// positions and offsets are compared as well.
        /// </summary>
        /// <param name="d1">First array of vectors; may be null, in which case <paramref name="d2"/> must be null too.</param>
        /// <param name="d2">Second array of vectors.</param>
        public static void  VerifyEquals(TermFreqVector[] d1, TermFreqVector[] d2)
        {
            if (d1 == null)
            {
                Assert.IsTrue(d2 == null);
                return;
            }
            Assert.IsTrue(d2 != null);

            Assert.AreEqual(d1.Length, d2.Length);
            for (int i = 0; i < d1.Length; i++)
            {
                TermFreqVector v1 = d1[i];
                TermFreqVector v2 = d2[i];
                if (v1 == null || v2 == null)
                {
                    // Fail with the diagnostic instead of printing it and then hitting a
                    // NullReferenceException on the dereference below.
                    Assert.Fail("v1=" + v1 + " v2=" + v2 + " i=" + i + " of " + d1.Length);
                }
                Assert.AreEqual(v1.Size(), v2.Size());
                int             numTerms = v1.Size();
                System.String[] terms1   = v1.GetTerms();
                System.String[] terms2   = v2.GetTerms();
                int[]           freq1    = v1.GetTermFrequencies();
                int[]           freq2    = v2.GetTermFrequencies();
                for (int j = 0; j < numTerms; j++)
                {
                    Assert.AreEqual(terms1[j], terms2[j]);
                    Assert.AreEqual(freq1[j], freq2[j]);
                }
                if (v1 is TermPositionVector)
                {
                    Assert.IsTrue(v2 is TermPositionVector);
                    TermPositionVector tpv1 = (TermPositionVector)v1;
                    TermPositionVector tpv2 = (TermPositionVector)v2;
                    for (int j = 0; j < numTerms; j++)
                    {
                        int[] pos1 = tpv1.GetTermPositions(j);
                        int[] pos2 = tpv2.GetTermPositions(j);
                        Assert.AreEqual(pos1.Length, pos2.Length);
                        TermVectorOffsetInfo[] offsets1 = tpv1.GetOffsets(j);
                        TermVectorOffsetInfo[] offsets2 = tpv2.GetOffsets(j);
                        // Offsets must be present on both sides or absent on both sides.
                        if (offsets1 == null)
                        {
                            Assert.IsTrue(offsets2 == null);
                        }
                        else
                        {
                            Assert.IsTrue(offsets2 != null);
                        }
                        for (int k = 0; k < pos1.Length; k++)
                        {
                            Assert.AreEqual(pos1[k], pos2[k]);
                            if (offsets1 != null)
                            {
                                Assert.AreEqual(offsets1[k].GetStartOffset(), offsets2[k].GetStartOffset());
                                Assert.AreEqual(offsets1[k].GetEndOffset(), offsets2[k].GetEndOffset());
                            }
                        }
                    }
                }
            }
        }
コード例 #16
0
ファイル: TermVectorsWriter.cs プロジェクト: emtees/old-code
		/// <summary>
		/// Writes one term frequency vector: opens the vector's field, adds every
		/// term together with its frequency, then closes the field.
		/// </summary>
		/// <param name="vector">Vector whose field, terms and frequencies are written.</param>
		private void  AddTermFreqVectorInternal(TermFreqVector vector)
		{
			OpenField(vector.GetField());

			// Fetch the size and both arrays once instead of calling the
			// accessors again on every iteration.
			int size = vector.Size();
			System.String[] terms = vector.GetTerms();
			int[] freqs = vector.GetTermFrequencies();
			for (int i = 0; i < size; i++)
			{
				AddTermInternal(terms[i], freqs[i]);
			}
			CloseField();
		}
コード例 #17
0
ファイル: TestTermVectors.cs プロジェクト: yonder/mono
        /// <summary>
        /// Indexes four small documents with known contents and checks the term vectors
        /// read back from the index: per-term frequencies against the postings, the hit
        /// order for the query "chocolate", and the frequencies of the second hit against
        /// the expected values for <c>test4</c>.
        /// An IOException is printed to standard error and fails the test.
        /// </summary>
        public virtual void  TestKnownSetOfDocuments()
        {
            System.String   test1     = "eating chocolate in a computer lab";                                             //6 terms
            System.String   test2     = "computer in a computer lab";                                                     //5 terms
            System.String   test3     = "a chocolate lab grows old";                                                      //5 terms
            System.String   test4     = "eating chocolate with a chocolate lab in an old chocolate colored computer lab"; //13 terms

            // Expected frequency of each of the 10 distinct terms of test4.
            System.Collections.IDictionary test4Map = new System.Collections.Hashtable();
            test4Map["chocolate"] = 3;
            test4Map["lab"]       = 2;
            test4Map["eating"]    = 1;
            test4Map["computer"]  = 1;
            test4Map["with"]      = 1;
            test4Map["a"]         = 1;
            test4Map["colored"]   = 1;
            test4Map["in"]        = 1;
            test4Map["an"]        = 1;
            test4Map["old"]       = 1;

            Document testDoc1 = new Document();

            SetupDoc(testDoc1, test1);
            Document testDoc2 = new Document();

            SetupDoc(testDoc2, test2);
            Document testDoc3 = new Document();

            SetupDoc(testDoc3, test3);
            Document testDoc4 = new Document();

            SetupDoc(testDoc4, test4);

            Directory dir = new RAMDirectory();

            try
            {
                IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true);
                Assert.IsTrue(writer != null);
                writer.AddDocument(testDoc1);
                writer.AddDocument(testDoc2);
                writer.AddDocument(testDoc3);
                writer.AddDocument(testDoc4);
                writer.Close();
                IndexSearcher knownSearcher = new IndexSearcher(dir);
                TermEnum      termEnum      = knownSearcher.reader.Terms();
                TermDocs      termDocs      = knownSearcher.reader.TermDocs();

                Similarity sim = knownSearcher.GetSimilarity();
                while (termEnum.Next() == true)
                {
                    Term term = termEnum.Term();
                    termDocs.Seek(term);
                    while (termDocs.Next())
                    {
                        int docId = termDocs.Doc();
                        int freq  = termDocs.Freq();
                        TermFreqVector vector = knownSearcher.reader.GetTermFreqVector(docId, "Field");
                        // Assert before the vector is dereferenced, so a missing vector is
                        // reported as an assertion failure rather than a NullReferenceException.
                        Assert.IsTrue(vector != null);

                        // The three results below are not inspected; the calls are kept so
                        // the Similarity methods still run against the indexed data.
                        float tf  = sim.Tf(freq);
                        float idf = sim.Idf(term, knownSearcher);
                        //This is fine since we don't have stop words
                        float lNorm = sim.LengthNorm("Field", vector.GetTerms().Length);

                        // The frequency stored in the vector must match the postings frequency.
                        System.String[] vTerms = vector.GetTerms();
                        int[]           freqs  = vector.GetTermFrequencies();
                        for (int i = 0; i < vTerms.Length; i++)
                        {
                            if (term.Text().Equals(vTerms[i]))
                            {
                                Assert.IsTrue(freqs[i] == freq);
                            }
                        }
                    }
                }
                Query query = new TermQuery(new Term("Field", "chocolate"));
                Hits  hits  = knownSearcher.Search(query);
                //doc 3 should be the first hit b/c it is the shortest match
                Assert.IsTrue(hits.Length() == 3);
                float score = hits.Score(0);

                Assert.IsTrue(testDoc3.ToString().Equals(hits.Doc(0).ToString()));
                Assert.IsTrue(testDoc4.ToString().Equals(hits.Doc(1).ToString()));
                Assert.IsTrue(testDoc1.ToString().Equals(hits.Doc(2).ToString()));
                TermFreqVector vector2 = knownSearcher.reader.GetTermFreqVector(hits.Id(1), "Field");
                Assert.IsTrue(vector2 != null);
                System.String[] terms  = vector2.GetTerms();
                int[]           freqs2 = vector2.GetTermFrequencies();
                Assert.IsTrue(terms != null && terms.Length == 10);
                for (int i = 0; i < terms.Length; i++)
                {
                    System.String term = terms[i];
                    int freq = freqs2[i];
                    Assert.IsTrue(test4.IndexOf(term) != -1);

                    // Check that the term is expected before unboxing; unboxing a missing
                    // (null) entry would throw before the assertion could report it.
                    System.Object tmpFreqInt = test4Map[term];
                    Assert.IsTrue(tmpFreqInt != null);
                    System.Int32 freqInt = (System.Int32)tmpFreqInt;
                    Assert.IsTrue(freqInt == freq);
                }
                knownSearcher.Close();
            }
            catch (System.IO.IOException e)
            {
                System.Console.Error.WriteLine(e.StackTrace);
                Assert.IsTrue(false);
            }
        }
コード例 #18
0
        /// <summary>
        /// Returns posts that share terms of their "exact" field with the post identified by
        /// <paramref name="postid"/>.
        /// </summary>
        /// <remarks>
        /// The post is located with a <c>postid:</c> query. Every term of the term vector of its
        /// "exact" field becomes an optional (SHOULD) clause of a boolean query, and the hits of that
        /// query, excluding the post itself, are converted with <c>CreateFromDocument</c>.
        /// The lookup is best-effort: a failure is reported to the debug output and whatever was
        /// collected up to that point (usually nothing) is returned.
        /// </remarks>
        /// <param name="postid">Value matched against the <c>postid</c> field.</param>
        /// <param name="itemsToReturn">Maximum number of posts to return; zero or less yields an empty list.</param>
        /// <returns>A list (never null) holding at most <paramref name="itemsToReturn"/> posts.</returns>
        public List <Post> Similar(int postid, int itemsToReturn)
        {
            List <Post> TList = new List <Post>();

            int docId = -1;

            IndexSearcher searcher = null;
            IndexReader   reader   = null;

            if (rd == null)
            {
                BuildIndex();
            }

            lck.AcquireReaderLock(ReaderTimeOut);
            try
            {
                Analyzer    analyzer = GetAnalyzer();
                QueryParser parser   = GetQueryParser(analyzer);
                parser.SetDefaultOperator(QueryParser.AND_OPERATOR);

                Query q = parser.Parse("postid:" + postid);

                searcher = new IndexSearcher(rd, true);
                //TODO
#pragma warning disable CS0618 // Type or member is obsolete
                Hits hits = searcher.Search(q);
#pragma warning restore CS0618 // Type or member is obsolete
                if (hits != null && hits.Length() > 0)
                {
                    docId = hits.Id(0);
                }

                if (docId > -1)
                {
                    reader = IndexReader.Open(rd, true);

                    // A null vector means no term vector is available for the "exact" field of
                    // this document; there is then nothing to compare against.
                    TermFreqVector tfv = reader.GetTermFreqVector(docId, "exact");
                    if (tfv != null)
                    {
                        // Fetch the term array once instead of once per iteration. Size() is kept
                        // as the loop bound because GetTerms() may be null for an empty vector.
                        int      termCount = tfv.Size();
                        string[] terms     = tfv.GetTerms();

                        BooleanQuery booleanQuery = new BooleanQuery();
                        for (int j = 0; j < termCount; j++)
                        {
                            TermQuery tq = new TermQuery(new Term("exact", terms[j]));
                            booleanQuery.Add(tq, BooleanClause.Occur.SHOULD);
                        }
                        //TODO
#pragma warning disable CS0618 // Type or member is obsolete
                        Hits similarhits = searcher.Search(booleanQuery, Sort.RELEVANCE);
#pragma warning restore CS0618 // Type or member is obsolete

                        // The limit is tested before each add so that a limit of zero (or less)
                        // never lets a post slip into the result.
                        for (int i = 0; i < similarhits.Length() && TList.Count < itemsToReturn; i++)
                        {
                            // Skip the post the search started from.
                            if (similarhits.Id(i) != docId)
                            {
                                TList.Add(CreateFromDocument(similarhits.Doc(i), analyzer, null));
                            }
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                // Deliberately best-effort: callers get the (possibly empty) list instead of an
                // exception, but the failure is no longer completely invisible.
                System.Diagnostics.Debug.WriteLine("Similar(" + postid + ") failed: " + ex);
            }
            finally
            {
                if (searcher != null)
                {
                    searcher.Close();
                }

                if (reader != null)
                {
                    reader.Close();
                }

                lck.ReleaseReaderLock();
            }

            return(TList);
        }
コード例 #19
0
        /// <summary> Add a complete document specified by all its term vectors. The document is
        /// opened and closed even when <paramref name="vectors"/> is null, so a document without
        /// term vectors still gets its entry.
        /// </summary>
        /// <param name="vectors">One term vector per field, or null. Vectors that are
        /// TermPositionVector instances also contribute their positions and offsets.
        /// </param>
        /// <exception cref="System.IO.IOException">May be propagated from the underlying writes.</exception>
        public void AddAllDocVectors(TermFreqVector[] vectors)
        {
            OpenDocument();

            if (vectors != null)
            {
                for (int i = 0; i < vectors.Length; i++)
                {
                    TermFreqVector vector = vectors[i];

                    // Null for plain frequency vectors; those are written without positions/offsets.
                    TermPositionVector tpVector = vector as TermPositionVector;

                    int numTerms = vector.Size();

                    // Whether positions/offsets are stored for the field is decided from the first term.
                    bool storePositionWithTermVector = tpVector != null && numTerms > 0 && tpVector.GetTermPositions(0) != null;
                    bool storeOffsetWithTermVector = tpVector != null && numTerms > 0 && tpVector.GetOffsets(0) != null;

                    FieldInfo fieldInfo = fieldInfos.FieldInfo(vector.GetField());
                    OpenField(fieldInfo.number, storePositionWithTermVector, storeOffsetWithTermVector);

                    // Fetch the parallel term/frequency arrays once rather than once per term.
                    string[] terms = vector.GetTerms();
                    int[] freqs = vector.GetTermFrequencies();

                    for (int j = 0; j < numTerms; j++)
                    {
                        AddTermInternal(
                            terms[j],
                            freqs[j],
                            tpVector != null ? tpVector.GetTermPositions(j) : null,
                            tpVector != null ? tpVector.GetOffsets(j) : null);
                    }

                    CloseField();
                }
            }

            CloseDocument();
        }
コード例 #20
0
        /// <summary> Writes the term vectors of one complete document to the tvx, tvd and tvf outputs.
        /// The current tvd and tvf file pointers are always recorded in tvx, so a document without
        /// term vectors (<paramref name="vectors"/> is null) still gets a tvx entry, followed by a
        /// field count of 0 in tvd.
        /// </summary>
        /// <param name="vectors">The document's term vectors, one per field, or null. Entries that are
        /// TermPositionVector instances may additionally have positions and offsets written.
        /// </param>
        /// <exception cref="System.SystemException">Positions (or offsets) are being stored for a
        /// field but are null for one of its terms.</exception>
        /// <throws>  IOException </throws>
        public void AddAllDocVectors(TermFreqVector[] vectors)
        {
            // Record where this document's data starts in tvd and in tvf.
            tvx.WriteLong(tvd.GetFilePointer());
            tvx.WriteLong(tvf.GetFilePointer());

            if (vectors != null)
            {
                int numFields = vectors.Length;
                tvd.WriteVInt(numFields);

                // tvf start position of every field; written to tvd as deltas in the 2nd pass below.
                long[] fieldPointers = new long[numFields];

                for (int i = 0; i < numFields; i++)
                {
                    fieldPointers[i] = tvf.GetFilePointer();

                    int fieldNumber = fieldInfos.FieldNumber(vectors[i].GetField());

                    // 1st pass: write field numbers to tvd
                    tvd.WriteVInt(fieldNumber);

                    int numTerms = vectors[i].Size();
                    tvf.WriteVInt(numTerms);

                    TermPositionVector tpVector;

                    // Flags written to tvf ahead of the terms: the sum of the position/offset flag
                    // constants that apply to this field (0 when neither is stored).
                    byte bits;
                    bool storePositions;
                    bool storeOffsets;

                    if (vectors[i] is TermPositionVector)
                    {
                        // May have positions & offsets
                        tpVector = (TermPositionVector) vectors[i];
                        // The decision is taken from the first term only; later terms that lack the
                        // data trigger the exceptions thrown further down.
                        storePositions = tpVector.Size() > 0 && tpVector.GetTermPositions(0) != null;
                        storeOffsets = tpVector.Size() > 0 && tpVector.GetOffsets(0) != null;
                        bits = (byte) ((storePositions?TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR: (byte) 0) + (storeOffsets?TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR: (byte) 0));
                    }
                    else
                    {
                        // Plain frequency vector: no positions, no offsets.
                        tpVector = null;
                        bits = 0;
                        storePositions = false;
                        storeOffsets = false;
                    }

                    tvf.WriteVInt(bits);

                    System.String[] terms = vectors[i].GetTerms();
                    int[] freqs = vectors[i].GetTermFrequencies();

                    // Two UTF-8 buffers are used alternately: utf8Upto indexes the buffer for the
                    // current term, 1 - utf8Upto the one still holding the previous term. Buffer 1
                    // is emptied so the first term is compared against an empty previous term.
                    int utf8Upto = 0;
                    utf8Results[1].length = 0;

                    for (int j = 0; j < numTerms; j++)
                    {

                        UnicodeUtil.UTF16toUTF8(terms[j], 0, terms[j].Length, utf8Results[utf8Upto]);

                        // Terms are prefix-compressed against the previous term: only the bytes
                        // from 'start' onwards are written.
                        int start = StringHelper.BytesDifference(utf8Results[1 - utf8Upto].result, utf8Results[1 - utf8Upto].length, utf8Results[utf8Upto].result, utf8Results[utf8Upto].length);
                        int length = utf8Results[utf8Upto].length - start;
                        tvf.WriteVInt(start); // write shared prefix length
                        tvf.WriteVInt(length); // write delta length
                        tvf.WriteBytes(utf8Results[utf8Upto].result, start, length); // write delta bytes
                        // Swap buffers: the term just written becomes the "previous" term.
                        utf8Upto = 1 - utf8Upto;

                        int termFreq = freqs[j];

                        tvf.WriteVInt(termFreq);

                        if (storePositions)
                        {
                            int[] positions = tpVector.GetTermPositions(j);
                            if (positions == null)
                                throw new System.SystemException("Trying to write positions that are null!");
                            // No count is written for the positions; the assertion expects one per occurrence.
                            System.Diagnostics.Debug.Assert(positions.Length == termFreq);

                            // use delta encoding for positions
                            int lastPosition = 0;
                            for (int k = 0; k < positions.Length; k++)
                            {
                                int position = positions[k];
                                tvf.WriteVInt(position - lastPosition);
                                lastPosition = position;
                            }
                        }

                        if (storeOffsets)
                        {
                            TermVectorOffsetInfo[] offsets = tpVector.GetOffsets(j);
                            if (offsets == null)
                                throw new System.SystemException("Trying to write offsets that are null!");
                            // As with positions, the assertion expects one offset pair per occurrence.
                            System.Diagnostics.Debug.Assert(offsets.Length == termFreq);

                            // use delta encoding for offsets
                            // Each pair is written as (start - previous end, end - start).
                            int lastEndOffset = 0;
                            for (int k = 0; k < offsets.Length; k++)
                            {
                                int startOffset = offsets[k].GetStartOffset();
                                int endOffset = offsets[k].GetEndOffset();
                                tvf.WriteVInt(startOffset - lastEndOffset);
                                tvf.WriteVInt(endOffset - startOffset);
                                lastEndOffset = endOffset;
                            }
                        }
                    }
                }

                // 2nd pass: write field pointers to tvd
                // The first field's pointer is not written here: it equals the tvf pointer already
                // stored in tvx above. Only the numFields - 1 deltas between consecutive fields follow.
                if (numFields > 1)
                {
                    long lastFieldPointer = fieldPointers[0];
                    for (int i = 1; i < numFields; i++)
                    {
                        long fieldPointer = fieldPointers[i];
                        tvd.WriteVLong(fieldPointer - lastFieldPointer);
                        lastFieldPointer = fieldPointer;
                    }
                }
            }
            else
                // No term vectors for this document: a field count of zero.
                tvd.WriteVInt(0);
        }
コード例 #21
0
ファイル: TermVectorsWriter.cs プロジェクト: emtees/old-code
		/// <summary>
		/// Adds the given term vector to the document that is currently open.
		/// </summary>
		/// <remarks>
		/// A document must be open and no Field may be open when this is called. Term-by-term
		/// additions and whole-vector additions may be mixed within one document, but any single
		/// Field has to be populated entirely by one of the two approaches.
		/// </remarks>
		/// <param name="vector">The term vector to add.</param>
		/// <exception cref="System.SystemException">No document is open, or a Field is still open.</exception>
		public void AddTermFreqVector(TermFreqVector vector)
		{
			// The document check comes first so its message wins when both conditions fail.
			if (!IsDocumentOpen())
			{
				throw new System.SystemException("Cannot add term vector when document is not open");
			}

			if (IsFieldOpen())
			{
				throw new System.SystemException("Cannot add term vector when Field is open");
			}

			AddTermFreqVectorInternal(vector);
		}
コード例 #22
0
        /// <summary> Looks up the term vector stored for one Field of one document.</summary>
        /// <param name="docNum">Number of the document whose vector is wanted.
        /// </param>
        /// <param name="field">Name of the Field within that document.
        /// </param>
        /// <returns> The TermFreqVector for the document and Field, or null when there is no tvx
        /// input, the Field is not present in the document, or reading fails.
        /// </returns>
        public /*internal*/ virtual TermFreqVector Get(int docNum, System.String field)
        {
            lock (this)
            {
                int wantedFieldNumber = fieldInfos.FieldNumber(field);

                // Without a tvx input there are no term vectors to read for this segment.
                if (tvx == null)
                {
                    System.Console.Out.WriteLine("No tvx file");
                    return null;
                }

                try
                {
                    // The tvx seek has to skip the FORMAT_SIZE header; the other seeks use file
                    // pointers that were stored as-is, so they need no such adjustment.
                    tvx.Seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE);
                    long tvdPointer = tvx.ReadLong();

                    tvd.Seek(tvdPointer);
                    int fieldCount = tvd.ReadVInt();

                    // Documents have only a few fields, so a full scan is used instead of relying
                    // on any ordering. Every field number must be consumed anyway to reach the tvf
                    // pointers that follow them, hence no early exit on a match.
                    int runningNumber = 0;
                    int matchIndex = -1;
                    for (int fieldIndex = 0; fieldIndex < fieldCount; fieldIndex++)
                    {
                        runningNumber += tvd.ReadVInt();
                        if (runningNumber == wantedFieldNumber)
                        {
                            matchIndex = fieldIndex;
                        }
                    }

                    // The Field may be valid in the segment and still be absent from this document.
                    if (matchIndex == -1)
                    {
                        return null;
                    }

                    // Sum the stored pointer values up to and including the matching field to get
                    // its position in the tvf file.
                    long tvfPointer = 0;
                    for (int k = 0; k <= matchIndex; k++)
                    {
                        tvfPointer += tvd.ReadVLong();
                    }

                    return ReadTermVector(field, tvfPointer);
                }
                catch (System.Exception)
                {
                    // Any failure while reading is treated as "no vector available".
                    return null;
                }
            }
        }
コード例 #23
0
ファイル: TermVectorsWriter.cs プロジェクト: emtees/old-code
		/// <summary>
		/// Adds each of the given term vectors, in array order, to the currently open document.
		/// </summary>
		/// <param name="vectors">The term vectors to add.</param>
		/// <exception cref="System.SystemException">No document is open, or a Field is still open.</exception>
		public void AddVectors(TermFreqVector[] vectors)
		{
			if (!IsDocumentOpen())
			{
				throw new System.SystemException("Cannot add term vectors when document is not open");
			}

			if (IsFieldOpen())
			{
				throw new System.SystemException("Cannot add term vectors when Field is open");
			}

			foreach (TermFreqVector vector in vectors)
			{
				AddTermFreqVector(vector);
			}
		}