예제 #1
0
 /// <summary>
 /// Inserts a new document record into DocumentTermsColl.
 /// The insert is performed while holding the document-terms lock.
 /// </summary>
 /// <param name="documentTermsData">Document record to insert.</param>
 /// <returns>The value returned by the collection's insert, i.e. the id of the new document.</returns>
 public string PostDocumentTerms(DocumentTermsData documentTermsData)
 {
     string insertedId;

     lock (_lockerDocumentTermsColl)
     {
         insertedId = DocumentTermsColl.Insert(documentTermsData);
     }

     return insertedId;
 }
예제 #2
0
 /// <summary>
 /// Removes a document from storage.
 /// For every term of the document, the matching {term, count} entry in TermDocumentCountColl
 /// has its count reduced by one; afterwards the {Document, List&lt;TermData&gt;} record is
 /// deleted from DocumentTermsColl.
 /// </summary>
 /// <remarks>
 /// Open question left by the original author: whether {term, documentId} entries in
 /// TermDocumentColl should be deleted as well. This method does not touch that collection.
 /// </remarks>
 /// <param name="documentId">Identifier of the document to remove.</param>
 /// <returns>
 /// <c>false</c> when no document with this identifier is stored; otherwise the result of
 /// deleting the record from DocumentTermsColl.
 /// </returns>
 public bool DeleteDocument(string documentId)
 {
     lock (_lockerDocumentTermsColl)
     {
         DocumentTermsData storedDocument = DocumentTermsColl.FindOne(d => d.Document == documentId);
         if (storedDocument == null)
         {
             return false;
         }

         // The term-count lock is acquired while the document lock is still held.
         lock (_lockerTermDocumentCountColl)
         {
             foreach (TermData documentTerm in storedDocument.Terms)
             {
                 TermDocumentCountData counter = TermDocumentCountColl.FindOne(d => d.Term == documentTerm.Term);
                 if (counter == null)
                 {
                     continue; // no counter is stored for this term
                 }

                 counter.Count -= 1;
                 TermDocumentCountColl.Update(counter);
             }

             return DocumentTermsColl.Delete(storedDocument.Id);
         }
     }
 }
예제 #3
0
        /// <summary>
        /// Adds a new document along with its terms to the database:
        /// the document record is passed to Storage.PostDocumentTerms and the term list
        /// is then passed to Storage.PutTermDocumentCounts.
        /// </summary>
        /// <param name="documentId">Identifier stored in the Document field of the new record.</param>
        /// <param name="documentTerms">Terms stored in the Terms field of the new record.</param>
        public void AddDocument(string documentId, List <TermData> documentTerms)
        {
            Storage.PostDocumentTerms(new DocumentTermsData { Document = documentId, Terms = documentTerms });
            Storage.PutTermDocumentCounts(documentTerms);
        }
        /// <summary>
        /// Gets the cosine similarity of the tf-idf keyword vectors of two documents.
        /// Tf-Idf and Cosine similarity: https://janav.wordpress.com/2013/10/27/tf-idf-and-cosine-similarity/
        /// </summary>
        /// <param name="documentTermsData1">First document, already loaded by the caller.</param>
        /// <param name="documentId2">Identifier of the second document; it is looked up in storage.</param>
        /// <param name="tfIdfEstimator">Estimator used to reach the storage and to score single terms.</param>
        /// <returns>
        /// Cosine similarity dot(v1, v2) / (norm(v1) * norm(v2)). Returns 0 when either document is
        /// missing or when one of the vectors has zero norm.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="tfIdfEstimator"/> is null.</exception>
        public static double GetDocumentSimilarityExt(DocumentTermsData documentTermsData1, string documentId2, TfIdfEstimatorExt tfIdfEstimator)
        {
            if (tfIdfEstimator == null)
            {
                throw new ArgumentNullException(nameof(tfIdfEstimator));
            }

            DocumentTermsData documentTermsData2 = tfIdfEstimator.Storage.DocumentTermsColl.FindOne(x => x.Document == documentId2);

            // A document that does not exist has nothing in common with any other document.
            if (documentTermsData1 == null || documentTermsData2 == null)
            {
                return 0;
            }

            // A missing term list is treated like an empty one.
            List <TermData> terms1 = documentTermsData1.Terms ?? new List <TermData>();
            List <TermData> terms2 = documentTermsData2.Terms ?? new List <TermData>();

            // Union of the keywords of both documents.
            var keywords = new HashSet <string>();

            foreach (TermData termData in terms1)
            {
                keywords.Add(termData.Term);
            }

            foreach (TermData termData in terms2)
            {
                keywords.Add(termData.Term);
            }

            // Score every keyword of the union in both documents; the squared norms are
            // accumulated in the same pass.
            var scores1 = new List <double>(keywords.Count);
            var scores2 = new List <double>(keywords.Count);
            double sumOfSquares1 = 0;
            double sumOfSquares2 = 0;

            foreach (string keyword in keywords)
            {
                double score1 = tfIdfEstimator.GetOneTermInDocument(documentTermsData1.Document, keyword).TermScore;
                double score2 = tfIdfEstimator.GetOneTermInDocument(documentId2, keyword).TermScore;

                scores1.Add(score1);
                scores2.Add(score2);
                sumOfSquares1 += score1 * score1;
                sumOfSquares2 += score2 * score2;
            }

            double normProduct = Math.Sqrt(sumOfSquares1) * Math.Sqrt(sumOfSquares2);

            // A zero norm means at least one vector is all zeros; the similarity is defined as 0 then.
            return normProduct == 0 ? 0 : DotProduct(scores1, scores2) / normProduct;
        }
        /// <summary>
        /// Returns the top N documents most similar to the given document, along with their cosine similarities.
        /// This variant scores every document returned by DocumentTermsColl.FindAll().
        /// </summary>
        /// <param name="documentId1">Identifier of the document to compare all other documents with.</param>
        /// <param name="numberOfDocuments">Maximum number of results to return.</param>
        /// <param name="tfIdfEstimator">Estimator used to reach the storage and to score terms.</param>
        /// <returns>
        /// At most <paramref name="numberOfDocuments"/> entries ordered by score, highest first.
        /// The list is empty when the reference document is not stored.
        /// </returns>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="numberOfDocuments"/> is negative.</exception>
        public static List <DocumentSimilarityScoreData> GetSimilarDocumentsOLD1(string documentId1, int numberOfDocuments, TfIdfEstimatorExt tfIdfEstimator)
        {
            if (numberOfDocuments < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(numberOfDocuments));
            }

            var docScores = new List <DocumentSimilarityScoreData>();

            DocumentTermsData documentTermsData1 = tfIdfEstimator.Storage.DocumentTermsColl.FindOne(x => x.Document == documentId1);
            if (documentTermsData1 == null)
            {
                // Unknown reference document: there is nothing to compare against.
                return docScores;
            }

            IEnumerable <DocumentTermsData> docs = tfIdfEstimator.Storage.DocumentTermsColl.FindAll();//TODO: optimize it!!! - 2020-12-22T10:04:31

            // Get similarity scores of all documents except the reference document itself.
            foreach (DocumentTermsData documentTermsData in docs)
            {
                if (documentTermsData1.Document == documentTermsData.Document)
                {
                    continue;//skip it
                }

                docScores.Add(new DocumentSimilarityScoreData()
                {
                    Document = documentTermsData.Document,
                    Score    = GetDocumentSimilarityExt(documentTermsData1, documentTermsData.Document, tfIdfEstimator)
                });
            }

            // OrderByDescending does not sort in place, so its result has to be used.
            // Take (unlike GetRange) tolerates fewer than numberOfDocuments scored documents.
            return docScores.OrderByDescending(x => x.Score).Take(numberOfDocuments).ToList();
        }
예제 #6
0
        /// <summary>
        /// Computes the tf-idf value of one term in one document.
        /// Term frequency = count of the term in the document / sum of the counts of all terms of the document.
        /// Smoothed inverse document frequency = log10(number of documents / (documents with the term + 1)) + 1.
        /// </summary>
        /// <param name="documentId">Identifier of the document to look up in storage.</param>
        /// <param name="term">Term whose score is requested.</param>
        /// <returns>
        /// A TermScoreData holding the term and its tf-idf value. The value is 0 when the document is
        /// not stored, has no terms, or does not contain the term.
        /// </returns>
        public TermScoreData GetOneTermInDocument(string documentId, string term)
        {
            var tsd = new TermScoreData
            {
                Term      = term,
                TermScore = 0
            };

            DocumentTermsData documentTermsData = Storage.DocumentTermsColl.FindOne(x => x.Document == documentId);
            if (documentTermsData == null || documentTermsData.Terms == null)
            {
                return tsd;
            }

            long   countOfTerms  = documentTermsData.Terms.Sum(x => x.Count);
            var    term2         = documentTermsData.Terms.Find(x => x.Term == term);
            long   countOfTerm   = (term2 == null ? 0 : term2.Count);
            double termFrequency = (countOfTerms == 0 ? 0 : countOfTerm / (double)countOfTerms);

            if (termFrequency == 0)
            {
                // tf * idf is 0 regardless of idf, so the two storage queries below are not needed.
                return tsd;
            }

            int countOfDocs = Storage.DocumentTermsColl.Count();

            // A term without an entry in TermDocumentCountColl is counted as occurring in 0 documents.
            var    termDocumentCountData          = Storage.TermDocumentCountColl.FindOne(x => x.Term == term);
            long   countOfDocsWithTerm            = (termDocumentCountData == null ? 0 : termDocumentCountData.Count);
            double inverseDocumentFrequencySmooth = Math.Log10(countOfDocs / (double)(countOfDocsWithTerm + 1d)) + 1d;

            tsd.TermScore = termFrequency * inverseDocumentFrequencySmooth;

            return tsd;
        }
        /// <summary>
        /// Returns the top N documents most similar to the given document, along with their cosine similarities.
        /// Only documents selected by the term query below are scored.
        /// </summary>
        /// <param name="documentId1">Identifier of the document to compare other documents with.</param>
        /// <param name="numberOfDocuments">Maximum number of results to return.</param>
        /// <param name="tfIdfEstimator">Estimator used to reach the storage and to score terms.</param>
        /// <returns>
        /// At most <paramref name="numberOfDocuments"/> entries ordered by score, highest first.
        /// The list is empty when the reference document is not stored.
        /// </returns>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="numberOfDocuments"/> is negative.</exception>
        public static List <DocumentSimilarityScoreData> GetSimilarDocuments(string documentId1, int numberOfDocuments, TfIdfEstimatorExt tfIdfEstimator)
        {
            if (numberOfDocuments < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(numberOfDocuments));
            }

            var docScores = new List <DocumentSimilarityScoreData>();

            DocumentTermsData documentTermsData1 = tfIdfEstimator.Storage.DocumentTermsColl.FindOne(x => x.Document == documentId1);
            if (documentTermsData1 == null)
            {
                // Unknown reference document: there is nothing to compare against.
                return docScores;
            }

            //Example using multi key index https://github.com/mbdavid/LiteDB/blob/master/LiteDB.Tests/Database/MultiKey_Mapper_Tests.cs
            // Selects the documents that have at least one term accepted by Compare against the
            // reference document's terms (presumably a shared term - confirm against Compare).
            IEnumerable <DocumentTermsData> docs = tfIdfEstimator.Storage.DocumentTermsColl.Find(d => d.Terms.Select(z => z.Term).Any(x => Compare(documentTermsData1.Terms, x)));

            // Get similarity scores of all selected documents except the reference document itself.
            foreach (DocumentTermsData documentTermsData in docs)
            {
                if (documentTermsData1.Document == documentTermsData.Document)
                {
                    continue;//skip it
                }

                docScores.Add(new DocumentSimilarityScoreData()
                {
                    Document = documentTermsData.Document,
                    Score    = GetDocumentSimilarityExt(documentTermsData1, documentTermsData.Document, tfIdfEstimator)
                });
            }

            // OrderByDescending does not sort in place, so its result has to be used.
            // Take (unlike GetRange) tolerates fewer than numberOfDocuments scored documents.
            return docScores.OrderByDescending(x => x.Score).Take(numberOfDocuments).ToList();
        }
        /// <summary>
        /// Gets the cosine similarity of two documents that are both given by identifier.
        /// The first document is loaded from storage and the call is forwarded to the overload
        /// that takes an already loaded DocumentTermsData.
        /// </summary>
        /// <param name="documentId1">Identifier of the first document; it is looked up in storage.</param>
        /// <param name="documentId2">Identifier of the second document.</param>
        /// <param name="tfIdfEstimator">Estimator used to reach the storage.</param>
        /// <returns>The value returned by the forwarded call.</returns>
        public static double GetDocumentSimilarityExt(string documentId1, string documentId2, TfIdfEstimatorExt tfIdfEstimator)
        {
            var documents = tfIdfEstimator.Storage.DocumentTermsColl;
            DocumentTermsData firstDocument = documents.FindOne(x => x.Document == documentId1);

            return GetDocumentSimilarityExt(firstDocument, documentId2, tfIdfEstimator);
        }