/// <summary>
/// Adds three documents to a fresh estimator and checks that searching for the
/// <c>banana</c> term returns exactly <c>docName1</c> followed by <c>docName2</c>.
/// </summary>
public void SearchTest()
{
    // A fresh GUID is passed to the constructor; presumably it names a per-test
    // storage location — confirm against TfIdfEstimatorExt.
    TfIdfEstimatorExt tfIdfEstimator = new TfIdfEstimatorExt(Guid.NewGuid().ToString("N"));

    try
    {
        // banana appears in documents 1 and 2 only.
        var terms1 = new List<TermData>()
        {
            new TermData() { Term = banana, Count = 1 },
            new TermData() { Term = apple, Count = 2 },
        };

        var terms2 = new List<TermData>()
        {
            new TermData() { Term = banana, Count = 1 },
            new TermData() { Term = blueberry, Count = 5 },
        };

        var terms3 = new List<TermData>()
        {
            new TermData() { Term = strawberry, Count = 3 },
        };

        tfIdfEstimator.AddDocument(docName1, terms1);
        tfIdfEstimator.AddDocument(docName2, terms2);
        tfIdfEstimator.AddDocument(docName3, terms3);

        // NOTE(review): the second argument (10) looks like a maximum result count — confirm.
        var docs = tfIdfEstimator.Search(banana, 10);

        Assert.True(docs.Count == 2);
        Assert.True(docs[0] == docName1);
        Assert.True(docs[1] == docName2);
    }
    finally
    {
        // Runs even when an assertion above throws, so a failing test
        // does not leave its storage behind on disk.
        File.Delete(tfIdfEstimator.Storage.PathDirRootDataBases);
    }
}
/// <summary>
/// Adds two documents that contain the same terms with the same counts, listed in a
/// different order, and checks that their similarity is 1.
/// </summary>
public void TwoDocumentsSimilarity3Test()
{
    // Floating-point results should not be compared with ==; the two term lists are
    // ordered differently, so rounding may differ by a few ulps.
    const double tolerance = 1e-9;

    TfIdfEstimatorExt tfIdfEstimator = new TfIdfEstimatorExt(Guid.NewGuid().ToString("N"));

    try
    {
        var terms1 = new List<TermData>()
        {
            new TermData() { Term = banana, Count = 2 },
            new TermData() { Term = blueberry, Count = 3 },
            new TermData() { Term = apple, Count = 7 },
        };

        // Same terms and counts as terms1, in a different order.
        var terms2 = new List<TermData>()
        {
            new TermData() { Term = apple, Count = 7 },
            new TermData() { Term = banana, Count = 2 },
            new TermData() { Term = blueberry, Count = 3 },
        };

        tfIdfEstimator.AddDocument(docName1, terms1);
        tfIdfEstimator.AddDocument(docName2, terms2);

        double similarity = tfIdfEstimator.GetDocumentSimilarity(docName1, docName2);

        Assert.True(Math.Abs(similarity - 1d) < tolerance);
    }
    finally
    {
        // Runs even when the assertion throws, so a failing test
        // does not leave its storage behind on disk.
        File.Delete(tfIdfEstimator.Storage.PathDirRootDataBases);
    }
}
/// <summary>
/// Adds one document with five terms and checks, by reading the document-terms
/// collection directly, that the stored terms match the input in order.
/// </summary>
public void AddGetDocumentTest()
{
    TfIdfEstimatorExt tfIdfEstimator = new TfIdfEstimatorExt(Guid.NewGuid().ToString("N"));

    try
    {
        var terms = new List<TermData>()
        {
            new TermData() { Term = banana, Count = 1 },
            new TermData() { Term = apple, Count = 2 },
            new TermData() { Term = strawberry, Count = 3 },
            new TermData() { Term = cherry, Count = 4 },
            new TermData() { Term = blueberry, Count = 5 },
        };

        tfIdfEstimator.AddDocument(docName1, terms);

        var docterms = tfIdfEstimator.Storage.DocumentTermsColl.FindOne(x => x.Document == docName1);

        // Fail with a clear assertion instead of a NullReferenceException
        // when the document was not stored.
        Assert.True(docterms != null);

        // Without this check, extra stored terms would go unnoticed by the loop below.
        Assert.True(docterms.Terms.Count == terms.Count);

        for (int i = 0; i < terms.Count; i++)
        {
            Assert.True(docterms.Terms[i].Term == terms[i].Term);
        }
    }
    finally
    {
        // Delete the database (a single file in the LiteDB case). Runs even when an
        // assertion throws, so a failing test does not leave its storage behind.
        File.Delete(tfIdfEstimator.Storage.PathDirRootDataBases);
    }
}
/// <summary>
/// Adds one document with five terms and checks that <c>GetDocument</c> returns
/// the same terms in the same order.
/// </summary>
public void AddGetDocumentTest()
{
    TfIdfEstimatorExt tfIdfEstimator = new TfIdfEstimatorExt(Guid.NewGuid().ToString("N"));

    try
    {
        string docName = "TestDoc";

        var terms = new List<TermData>()
        {
            new TermData() { Term = "banana", Count = 1 },
            new TermData() { Term = "apple", Count = 2 },
            new TermData() { Term = "strawberry", Count = 3 },
            new TermData() { Term = "cherry", Count = 4 },
            new TermData() { Term = "bluberry", Count = 5 },
        };

        tfIdfEstimator.AddDocument(docName, terms);

        DocumentTermsData documentTermsData = tfIdfEstimator.GetDocument(docName);

        // The loop below is bounded by the stored count, so without this check the
        // test would pass vacuously when fewer (or zero) terms were stored.
        Assert.True(documentTermsData.Terms.Count == terms.Count);

        for (int i = 0; i < documentTermsData.Terms.Count; i++)
        {
            Assert.True(documentTermsData.Terms[i].Term == terms[i].Term);
        }

        // TODO: check that all terms exist in TermDocumentCountColl - 2020-12-22T09:32:03
    }
    finally
    {
        // Delete the database (a single file in the LiteDB case). Runs even when an
        // assertion throws, so a failing test does not leave its storage behind.
        File.Delete(tfIdfEstimator.Storage.PathDirRootDataBases);
    }
}
/// <summary>
/// Adds three documents and checks the score of every term in each document
/// against values computed by hand in this test.
/// </summary>
public void TfIdfTest()
{
    // Floating-point scores should not be compared with ==; allow for rounding noise.
    const double tolerance = 1e-9;

    TfIdfEstimatorExt tfIdfEstimator = new TfIdfEstimatorExt(Guid.NewGuid().ToString("N"));

    try
    {
        var terms1 = new List<TermData>()
        {
            new TermData() { Term = banana, Count = 1 },
            new TermData() { Term = apple, Count = 2 },
        };

        var terms2 = new List<TermData>()
        {
            new TermData() { Term = banana, Count = 1 },
            new TermData() { Term = blueberry, Count = 5 },
        };

        var terms3 = new List<TermData>()
        {
            new TermData() { Term = strawberry, Count = 3 },
        };

        tfIdfEstimator.AddDocument(docName1, terms1);
        tfIdfEstimator.AddDocument(docName2, terms2);
        tfIdfEstimator.AddDocument(docName3, terms3);

        var t1 = tfIdfEstimator.GetAllTermsInDocument(docName1);
        var t2 = tfIdfEstimator.GetAllTermsInDocument(docName2);
        var t3 = tfIdfEstimator.GetAllTermsInDocument(docName3);

        // Expected score, as written out below:
        //   (term count / total term count in the document)
        //     * (log10(document count / (documents containing the term + 1)) + 1)
        // with 3 documents in total; banana occurs in 2 of them, every other term in 1.
        double banana1 = (1d / 3d) * (Math.Log10(3d / (2d + 1d)) + 1d);
        double apple1 = (2d / 3d) * (Math.Log10(3d / (1d + 1d)) + 1d);

        double banana2 = (1d / 6d) * (Math.Log10(3d / (2d + 1d)) + 1d);
        double blueberry2 = (5d / 6d) * (Math.Log10(3d / (1d + 1d)) + 1d);

        double strawberry3 = (1d / 1d) * (Math.Log10(3d / (1d + 1d)) + 1d);

        Assert.True(Math.Abs(t1[0].TermScore - banana1) < tolerance);
        Assert.True(Math.Abs(t1[1].TermScore - apple1) < tolerance);

        Assert.True(Math.Abs(t2[0].TermScore - banana2) < tolerance);
        Assert.True(Math.Abs(t2[1].TermScore - blueberry2) < tolerance);

        Assert.True(Math.Abs(t3[0].TermScore - strawberry3) < tolerance);
    }
    finally
    {
        // Runs even when an assertion throws, so a failing test
        // does not leave its storage behind on disk.
        File.Delete(tfIdfEstimator.Storage.PathDirRootDataBases);
    }
}