Exemplo n.º 1
0
        /// <summary>
        /// Adds one document with an empty term list and one document with two terms,
        /// then checks that the similarity reported for the pair is zero.
        /// </summary>
        public void TwoDocumentsSimilarity1Test()
        {
            // The estimator is constructed with a fresh GUID string on every run
            // (presumably so each test gets its own storage - confirm against TfIdfEstimatorExt).
            TfIdfEstimatorExt tfIdfEstimator = new TfIdfEstimatorExt(Guid.NewGuid().ToString("N"));

            try
            {
                // Document 1 intentionally has no terms at all.
                var emptyTerms = new List<TermData>();

                var fruitTerms = new List<TermData>()
                {
                    new TermData() { Term = banana, Count = 1 },
                    new TermData() { Term = blueberry, Count = 5 }
                };

                tfIdfEstimator.AddDocument(docName1, emptyTerms);
                tfIdfEstimator.AddDocument(docName2, fruitTerms);

                double similarity = tfIdfEstimator.GetDocumentSimilarity(docName1, docName2);

                Assert.True(similarity == 0);
            }
            finally
            {
                // Runs even when an assertion or estimator call above throws, so the
                // file at Storage.PathDirRootDataBases is not left behind by a failing test.
                File.Delete(tfIdfEstimator.Storage.PathDirRootDataBases);
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Adds three documents, two of which contain the <c>banana</c> term, and checks
        /// that searching for that term returns exactly those two documents in the
        /// order docName1, docName2.
        /// </summary>
        public void SearchTest()
        {
            // The estimator is constructed with a fresh GUID string on every run
            // (presumably so each test gets its own storage - confirm against TfIdfEstimatorExt).
            TfIdfEstimatorExt tfIdfEstimator = new TfIdfEstimatorExt(Guid.NewGuid().ToString("N"));

            try
            {
                var terms1 = new List<TermData>()
                {
                    new TermData() { Term = banana, Count = 1 },
                    new TermData() { Term = apple, Count = 2 },
                };

                var terms2 = new List<TermData>()
                {
                    new TermData() { Term = banana, Count = 1 },
                    new TermData() { Term = blueberry, Count = 5 }
                };

                // The third document does not contain the searched term.
                var terms3 = new List<TermData>()
                {
                    new TermData() { Term = strawberry, Count = 3 },
                };

                tfIdfEstimator.AddDocument(docName1, terms1);
                tfIdfEstimator.AddDocument(docName2, terms2);
                tfIdfEstimator.AddDocument(docName3, terms3);

                // NOTE(review): the second argument looks like a result limit - confirm against Search.
                var docs = tfIdfEstimator.Search(banana, 10);

                Assert.True(docs.Count == 2);
                Assert.True(docs[0] == docName1);
                Assert.True(docs[1] == docName2);
            }
            finally
            {
                // Runs even when an assertion or estimator call above throws, so the
                // file at Storage.PathDirRootDataBases is not left behind by a failing test.
                File.Delete(tfIdfEstimator.Storage.PathDirRootDataBases);
            }
        }
Exemplo n.º 3
0
        /// <summary>
        /// Adds a single document with five terms and checks that the record read back
        /// from <c>Storage.DocumentTermsColl</c> holds the same terms in the same order.
        /// </summary>
        public void AddGetDocumentTest()
        {
            // The estimator is constructed with a fresh GUID string on every run
            // (presumably so each test gets its own storage - confirm against TfIdfEstimatorExt).
            TfIdfEstimatorExt tfIdfEstimator = new TfIdfEstimatorExt(Guid.NewGuid().ToString("N"));

            try
            {
                var terms = new List<TermData>()
                {
                    new TermData() { Term = banana, Count = 1 },
                    new TermData() { Term = apple, Count = 2 },
                    new TermData() { Term = strawberry, Count = 3 },
                    new TermData() { Term = cherry, Count = 4 },
                    new TermData() { Term = blueberry, Count = 5 }
                };

                tfIdfEstimator.AddDocument(docName1, terms);

                // Read the stored record straight from the storage collection.
                var docterms = tfIdfEstimator.Storage.DocumentTermsColl.FindOne(x => x.Document == docName1);

                // Without this check, extra stored terms beyond terms.Count would go unnoticed.
                Assert.True(docterms.Terms.Count == terms.Count);

                for (int i = 0; i < terms.Count; i++)
                {
                    Assert.True(docterms.Terms[i].Term == terms[i].Term);
                }
            }
            finally
            {
                // Delete Database - in case of LiteDB we delete one file.
                // Done in finally so a failing assertion does not leave the file behind.
                File.Delete(tfIdfEstimator.Storage.PathDirRootDataBases);
            }
        }
        /// <summary>
        /// Adds a single document with five terms and checks that
        /// <c>GetDocument</c> returns the same terms in the same order.
        /// </summary>
        public void AddGetDocumentTest()
        {
            // The estimator is constructed with a fresh GUID string on every run
            // (presumably so each test gets its own storage - confirm against TfIdfEstimatorExt).
            TfIdfEstimatorExt tfIdfEstimator = new TfIdfEstimatorExt(Guid.NewGuid().ToString("N"));
            string docName = "TestDoc";

            try
            {
                var terms = new List<TermData>()
                {
                    new TermData() { Term = "banana", Count = 1 },
                    new TermData() { Term = "apple", Count = 2 },
                    new TermData() { Term = "strawberry", Count = 3 },
                    new TermData() { Term = "cherry", Count = 4 },
                    new TermData() { Term = "bluberry", Count = 5 }
                };

                tfIdfEstimator.AddDocument(docName, terms);

                DocumentTermsData documentTermsData = tfIdfEstimator.GetDocument(docName);

                // The loop below is bounded by the stored term count, so without this
                // check an empty or truncated stored document would pass vacuously.
                Assert.True(documentTermsData.Terms.Count == terms.Count);

                for (int i = 0; i < documentTermsData.Terms.Count; i++)
                {
                    Assert.True(documentTermsData.Terms[i].Term == terms[i].Term);
                }
                //TODO: check is all term exist in TermDocumentCountColl - 2020-12-22T09:32:03
            }
            finally
            {
                // Delete Database - in case of LiteDB we delete one file.
                // Done in finally so a failing assertion does not leave the file behind.
                File.Delete(tfIdfEstimator.Storage.PathDirRootDataBases);
            }
        }
Exemplo n.º 5
0
        /// <summary>
        /// Adds three documents and checks the per-term scores returned by
        /// <c>GetAllTermsInDocument</c> against values computed by hand.
        /// </summary>
        public void TfIdfTest()
        {
            // Scores are floating-point results of division and Log10, so they are
            // compared within a small tolerance rather than with exact equality.
            const double tolerance = 1e-12;

            // The estimator is constructed with a fresh GUID string on every run
            // (presumably so each test gets its own storage - confirm against TfIdfEstimatorExt).
            TfIdfEstimatorExt tfIdfEstimator = new TfIdfEstimatorExt(Guid.NewGuid().ToString("N"));

            try
            {
                var terms1 = new List<TermData>()
                {
                    new TermData() { Term = banana, Count = 1 },
                    new TermData() { Term = apple, Count = 2 },
                };

                var terms2 = new List<TermData>()
                {
                    new TermData() { Term = banana, Count = 1 },
                    new TermData() { Term = blueberry, Count = 5 }
                };

                var terms3 = new List<TermData>()
                {
                    new TermData() { Term = strawberry, Count = 3 },
                };

                tfIdfEstimator.AddDocument(docName1, terms1);
                tfIdfEstimator.AddDocument(docName2, terms2);
                tfIdfEstimator.AddDocument(docName3, terms3);

                var t1 = tfIdfEstimator.GetAllTermsInDocument(docName1);
                var t2 = tfIdfEstimator.GetAllTermsInDocument(docName2);
                var t3 = tfIdfEstimator.GetAllTermsInDocument(docName3);

                // Expected scores. The literals line up with the fixture as
                // (term count / total term count in the document)
                //   * (Log10(3 documents / (documents containing the term + 1)) + 1).
                double banana1 = (1d / 3d) * (Math.Log10(3d / (2d + 1d)) + 1d);
                double apple1  = (2d / 3d) * (Math.Log10(3d / (1d + 1d)) + 1d);

                double banana2    = (1d / 6d) * (Math.Log10(3d / (2d + 1d)) + 1d);
                double blueberry2 = (5d / 6d) * (Math.Log10(3d / (1d + 1d)) + 1d);

                double strawberry3 = (1d / 1d) * (Math.Log10(3d / (1d + 1d)) + 1d);

                // Results are expected in the same order the terms were added.
                Assert.True(Math.Abs(t1[0].TermScore - banana1) < tolerance);
                Assert.True(Math.Abs(t1[1].TermScore - apple1) < tolerance);
                Assert.True(Math.Abs(t2[0].TermScore - banana2) < tolerance);
                Assert.True(Math.Abs(t2[1].TermScore - blueberry2) < tolerance);
                Assert.True(Math.Abs(t3[0].TermScore - strawberry3) < tolerance);
            }
            finally
            {
                // Runs even when an assertion or estimator call above throws, so the
                // file at Storage.PathDirRootDataBases is not left behind by a failing test.
                File.Delete(tfIdfEstimator.Storage.PathDirRootDataBases);
            }
        }