/// <summary>
/// Inserts a new document (with its terms) into <c>DocumentTermsColl</c>.
/// The insert runs while holding <c>_lockerDocumentTermsColl</c>.
/// </summary>
/// <param name="documentTermsData">The document record to store.</param>
/// <returns>The value returned by the collection's insert, i.e. the id of the new document.</returns>
public string PostDocumentTerms(DocumentTermsData documentTermsData)
{
    string insertedId;

    lock (_lockerDocumentTermsColl)
    {
        insertedId = DocumentTermsColl.Insert(documentTermsData);
    }

    return insertedId;
}
/// <summary>
/// Removes a document from storage:
/// 1. decrements the count in <c>TermDocumentCountColl</c> for every term of the document;
/// 2. deletes the document record from <c>DocumentTermsColl</c>.
/// Count entries are only decremented and updated here; they are never removed by this method.
/// </summary>
/// <param name="documentId">Value matched against the <c>Document</c> field of the stored records.</param>
/// <returns>
/// <c>false</c> when no stored record matches <paramref name="documentId"/>;
/// otherwise the result of the delete on <c>DocumentTermsColl</c>.
/// </returns>
public bool DeleteDocument(string documentId)
{
    lock (_lockerDocumentTermsColl)
    {
        DocumentTermsData storedDocument = DocumentTermsColl.FindOne(d => d.Document == documentId);
        if (storedDocument == null)
        {
            // Nothing to delete.
            return false;
        }

        // The count collection has its own lock; it is taken inside the document lock.
        lock (_lockerTermDocumentCountColl)
        {
            foreach (TermData documentTerm in storedDocument.Terms)
            {
                TermDocumentCountData counter = TermDocumentCountColl.FindOne(d => d.Term == documentTerm.Term);
                if (counter == null)
                {
                    // No count entry for this term - nothing to decrement.
                    continue;
                }

                counter.Count--;
                TermDocumentCountColl.Update(counter);
            }

            // The document record itself is removed only after the counts were adjusted.
            return DocumentTermsColl.Delete(storedDocument.Id);
        }
    }
}
/// <summary>
/// Adds a new document along with its terms to the database.
/// The document record is stored first, then the term counts are updated
/// through <c>Storage.PutTermDocumentCounts</c>.
/// </summary>
/// <param name="documentId">Identifier stored in the record's <c>Document</c> field.</param>
/// <param name="documentTerms">Terms of the document; stored with the record and passed on for count bookkeeping.</param>
public void AddDocument(string documentId, List<TermData> documentTerms)
{
    Storage.PostDocumentTerms(new DocumentTermsData
    {
        Document = documentId,
        Terms = documentTerms
    });

    Storage.PutTermDocumentCounts(documentTerms);
}
/// <summary>
/// Gets the cosine similarity of the TF-IDF term vectors of two documents.
/// Tf-Idf and Cosine similarity: https://janav.wordpress.com/2013/10/27/tf-idf-and-cosine-similarity/
/// </summary>
/// <param name="documentTermsData1">Already loaded record (id and terms) of the first document.</param>
/// <param name="documentId2">Id of the second document; its record is looked up in <c>tfIdfEstimator.Storage.DocumentTermsColl</c>.</param>
/// <param name="tfIdfEstimator">Estimator that gives access to the storage and computes the per-term scores.</param>
/// <returns>
/// Cosine similarity of the two vectors. Returns 0 when either vector has zero length,
/// and also when either document record (or its term list) is missing.
/// </returns>
public static double GetDocumentSimilarityExt(DocumentTermsData documentTermsData1, string documentId2, TfIdfEstimatorExt tfIdfEstimator)
{
    // A missing document has no term vector. Treat it like a zero-length vector
    // (similarity 0) instead of failing with a NullReferenceException.
    if (documentTermsData1?.Terms == null)
    {
        return 0;
    }

    DocumentTermsData documentTermsData2 = tfIdfEstimator.Storage.DocumentTermsColl.FindOne(x => x.Document == documentId2);
    if (documentTermsData2?.Terms == null)
    {
        return 0;
    }

    // Union of the terms of both documents: one vector dimension per distinct term.
    var vocabulary = new HashSet<string>();
    foreach (TermData termData in documentTermsData1.Terms)
    {
        vocabulary.Add(termData.Term);
    }
    foreach (TermData termData in documentTermsData2.Terms)
    {
        vocabulary.Add(termData.Term);
    }

    // Score every term of the union in both documents, so both vectors share the same dimensions.
    var scores1 = new List<double>(vocabulary.Count);
    var scores2 = new List<double>(vocabulary.Count);
    foreach (string term in vocabulary)
    {
        scores1.Add(tfIdfEstimator.GetOneTermInDocument(documentTermsData1.Document, term).TermScore);
        scores2.Add(tfIdfEstimator.GetOneTermInDocument(documentId2, term).TermScore);
    }

    // CosSimilarity(v1, v2) = dot(v1, v2) / (norm(v1) * norm(v2)) where v1 and v2 are vectors.
    double dot = DotProduct(scores1, scores2);

    double norm1 = 0;
    foreach (double score in scores1)
    {
        norm1 += Math.Pow(score, 2);
    }
    norm1 = Math.Sqrt(norm1);

    double norm2 = 0;
    foreach (double score in scores2)
    {
        norm2 += Math.Pow(score, 2);
    }
    norm2 = Math.Sqrt(norm2);

    // A zero-length vector would make the denominator zero; define the similarity as 0 in that case.
    double denominator = norm1 * norm2;
    return denominator == 0 ? 0 : dot / denominator;
}
/// <summary>
/// Returns the top N documents most similar to the given document, along with their cosine similarities.
/// This variant scores every other document in <c>tfIdfEstimator.Storage.DocumentTermsColl</c>.
/// </summary>
/// <param name="documentId1">Id of the reference document.</param>
/// <param name="numberOfDocuments">Maximum number of results; fewer are returned when fewer candidates exist.</param>
/// <param name="tfIdfEstimator">Estimator that gives access to the storage and computes the term scores.</param>
/// <returns>
/// Up to <paramref name="numberOfDocuments"/> entries ordered by descending score.
/// Empty when the reference document is not found.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="numberOfDocuments"/> is negative.</exception>
public static List<DocumentSimilarityScoreData> GetSimilarDocumentsOLD1(string documentId1, int numberOfDocuments, TfIdfEstimatorExt tfIdfEstimator)
{
    if (numberOfDocuments < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(numberOfDocuments), numberOfDocuments, "The number of documents must not be negative.");
    }

    DocumentTermsData documentTermsData1 = tfIdfEstimator.Storage.DocumentTermsColl.FindOne(x => x.Document == documentId1);
    if (documentTermsData1 == null)
    {
        // Unknown reference document: there is nothing to compare against.
        return new List<DocumentSimilarityScoreData>();
    }

    // TODO: optimize it!!! - every stored document is scored (2020-12-22T10:04:31)
    IEnumerable<DocumentTermsData> docs = tfIdfEstimator.Storage.DocumentTermsColl.FindAll();

    // Get similarity scores of all documents except the reference document itself.
    var docScores = new List<DocumentSimilarityScoreData>();
    foreach (DocumentTermsData documentTermsData in docs)
    {
        if (documentTermsData1.Document == documentTermsData.Document)
        {
            continue;
        }

        docScores.Add(new DocumentSimilarityScoreData()
        {
            Document = documentTermsData.Document,
            Score = GetDocumentSimilarityExt(documentTermsData1, documentTermsData.Document, tfIdfEstimator)
        });
    }

    // OrderByDescending does not sort in place - its result has to be consumed.
    // Take (unlike GetRange) does not throw when fewer than N candidates exist.
    return docScores
        .OrderByDescending(x => x.Score)
        .Take(numberOfDocuments)
        .ToList();
}
/// <summary>
/// Computes the tf-idf value of one term in one document.
/// tf  = (count of the term in the document) / (sum of all term counts of the document);
/// idf = log10(number of documents / (number of documents containing the term + 1)) + 1 (smoothed).
/// </summary>
/// <param name="documentId">Id of the document, matched against the <c>Document</c> field in <c>Storage.DocumentTermsColl</c>.</param>
/// <param name="term">The term to score.</param>
/// <returns>
/// A <c>TermScoreData</c> holding the term and its tf-idf value. The value is 0 when the document
/// is not found or has no term list. A term without an entry in <c>Storage.TermDocumentCountColl</c>
/// is treated as contained in 0 documents.
/// </returns>
public TermScoreData GetOneTermInDocument(string documentId, string term)
{
    DocumentTermsData documentTermsData = Storage.DocumentTermsColl.FindOne(x => x.Document == documentId);
    if (documentTermsData?.Terms == null)
    {
        // Unknown document (or no term list): the term frequency is 0, so the score is 0.
        // Previously this path failed with a NullReferenceException.
        return new TermScoreData
        {
            Term      = term,
            TermScore = 0d
        };
    }

    // Term frequency within the document.
    long countOfTerms = documentTermsData.Terms.Sum(x => x.Count);
    var term2 = documentTermsData.Terms.Find(x => x.Term == term);
    long countOfTerm = (term2 == null ? 0 : term2.Count);
    double termFrequency = (countOfTerms == 0 ? 0 : countOfTerm / (double)countOfTerms);

    // Smoothed inverse document frequency.
    int countOfDocs = Storage.DocumentTermsColl.Count();
    TermDocumentCountData termDocumentCountData = Storage.TermDocumentCountColl.FindOne(x => x.Term == term);
    // A term that has no count entry is contained in no document.
    long countOfDocsWithTerm = termDocumentCountData?.Count ?? 0;
    double inverseDocumentFrequencySmooth = Math.Log10(countOfDocs / (double)(countOfDocsWithTerm + 1d)) + 1d;

    return new TermScoreData
    {
        Term      = term,
        TermScore = termFrequency * inverseDocumentFrequencySmooth
    };
}
/// <summary>
/// Returns the top N documents most similar to the given document, along with their cosine similarities.
/// Only documents selected by the term-based candidate query below are scored.
/// </summary>
/// <param name="documentId1">Id of the reference document.</param>
/// <param name="numberOfDocuments">Maximum number of results; fewer are returned when fewer candidates exist.</param>
/// <param name="tfIdfEstimator">Estimator that gives access to the storage and computes the term scores.</param>
/// <returns>
/// Up to <paramref name="numberOfDocuments"/> entries ordered by descending score.
/// Empty when the reference document is not found.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="numberOfDocuments"/> is negative.</exception>
public static List<DocumentSimilarityScoreData> GetSimilarDocuments(string documentId1, int numberOfDocuments, TfIdfEstimatorExt tfIdfEstimator)
{
    if (numberOfDocuments < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(numberOfDocuments), numberOfDocuments, "The number of documents must not be negative.");
    }

    DocumentTermsData documentTermsData1 = tfIdfEstimator.Storage.DocumentTermsColl.FindOne(x => x.Document == documentId1);
    if (documentTermsData1 == null)
    {
        // Unknown reference document: there is nothing to compare against.
        return new List<DocumentSimilarityScoreData>();
    }

    // Candidate documents: those for which Compare accepts at least one of their terms.
    // NOTE(review): Compare is defined outside this block; presumably it checks whether the term is
    // contained in the reference document's terms - confirm, and confirm that the storage engine can
    // evaluate this predicate.
    // Example using multi key index: https://github.com/mbdavid/LiteDB/blob/master/LiteDB.Tests/Database/MultiKey_Mapper_Tests.cs
    IEnumerable<DocumentTermsData> docs = tfIdfEstimator.Storage.DocumentTermsColl.Find(d => d.Terms.Select(z => z.Term).Any(x => Compare(documentTermsData1.Terms, x)));

    // Get similarity scores of all candidates except the reference document itself.
    var docScores = new List<DocumentSimilarityScoreData>();
    foreach (DocumentTermsData documentTermsData in docs)
    {
        if (documentTermsData1.Document == documentTermsData.Document)
        {
            continue;
        }

        docScores.Add(new DocumentSimilarityScoreData()
        {
            Document = documentTermsData.Document,
            Score = GetDocumentSimilarityExt(documentTermsData1, documentTermsData.Document, tfIdfEstimator)
        });
    }

    // OrderByDescending does not sort in place - its result has to be consumed.
    // Take (unlike GetRange) does not throw when fewer than N candidates exist.
    return docScores
        .OrderByDescending(x => x.Score)
        .Take(numberOfDocuments)
        .ToList();
}
/// <summary>
/// Gets the cosine similarity of two documents identified by their ids.
/// The record of the first document is looked up in <c>tfIdfEstimator.Storage.DocumentTermsColl</c>
/// and the computation is delegated to the overload that takes an already loaded record.
/// </summary>
/// <param name="documentId1">Id of the first document.</param>
/// <param name="documentId2">Id of the second document.</param>
/// <param name="tfIdfEstimator">Estimator that gives access to the storage and computes the term scores.</param>
/// <returns>The cosine similarity computed by the delegated overload.</returns>
public static double GetDocumentSimilarityExt(string documentId1, string documentId2, TfIdfEstimatorExt tfIdfEstimator)
{
    // The lookup result is handed over as-is; it is not checked here.
    return GetDocumentSimilarityExt(
        tfIdfEstimator.Storage.DocumentTermsColl.FindOne(x => x.Document == documentId1),
        documentId2,
        tfIdfEstimator);
}