/// <summary>
/// Computes cosine similarities between two TermDocumentMatrices.
/// Cosine similarity is defined as (dot product) / (length * length).
/// </summary>
/// <remarks>
/// Both matrices are first passed through <c>TermDocumentMatrix.Equalize</c>. Document vectors
/// are read from the equalized matrices, while document counts and names are read from the
/// original <paramref name="m1"/> and <paramref name="m2"/>. A pair whose length product is
/// exactly zero receives a score of 0.0 instead of dividing by zero. For each source document
/// the links are sorted with <c>TLLinksList.Sort()</c> before being added to the result.
/// </remarks>
/// <param name="m1">Source matrix (documented by the original author as a binary document matrix; not checked here)</param>
/// <param name="m2">Target matrix (documented by the original author as tf-idf weighted; not checked here)</param>
/// <returns>Similarity matrix populated via one AddLink call per (m1 document, m2 document) pair</returns>
public static TLSimilarityMatrix ComputeCosine(TermDocumentMatrix m1, TermDocumentMatrix m2)
{
    TLSimilarityMatrix sims = new TLSimilarityMatrix();
    List<TermDocumentMatrix> matrices = TermDocumentMatrix.Equalize(m1, m2);
    for (int i = 0; i < m1.NumDocs; i++)
    {
        // The source vector's length depends only on i, so compute it once per source
        // document instead of once per (source, target) pair.
        // NOTE(review): assumes ComputeLength returns double - confirm.
        double sourceLength = ComputeLength(matrices[0].GetDocument(i));
        TLLinksList links = new TLLinksList();
        for (int j = 0; j < m2.NumDocs; j++)
        {
            double lengthProduct = sourceLength * ComputeLength(matrices[1].GetDocument(j));
            // Exact comparison is intentional: it only guards the division below.
            double score = 0.0;
            if (lengthProduct != 0.0)
            {
                score = ComputeDotProduct(matrices[0].GetDocument(i), matrices[1].GetDocument(j)) / lengthProduct;
            }
            links.Add(new TLSingleLink(m1.GetDocumentName(i), m2.GetDocumentName(j), score));
        }
        links.Sort();
        foreach (TLSingleLink link in links)
        {
            sims.AddLink(link.SourceArtifactId, link.TargetArtifactId, link.Score);
        }
    }
    return sims;
}
/// <summary>
/// Computes cosine similarities between two TermDocumentMatrices.
/// Cosine similarity is defined as (dot product) / (length * length).
/// </summary>
/// <remarks>
/// The inputs are equalized with <c>TermDocumentMatrix.Equalize</c> and the document vectors
/// are taken from the equalized pair. Document counts and names come from the original
/// <paramref name="m1"/> and <paramref name="m2"/>. When the product of the two vector
/// lengths is exactly zero the pair is scored 0.0 rather than dividing by zero. Links for
/// each source document are sorted with <c>TLLinksList.Sort()</c> before being copied into
/// the returned matrix.
/// </remarks>
/// <param name="m1">Source matrix (documented by the original author as a binary document matrix; not checked here)</param>
/// <param name="m2">Target matrix (documented by the original author as tf-idf weighted; not checked here)</param>
/// <returns>Similarity matrix populated via one AddLink call per (m1 document, m2 document) pair</returns>
public static TLSimilarityMatrix ComputeCosine(TermDocumentMatrix m1, TermDocumentMatrix m2)
{
    TLSimilarityMatrix sims = new TLSimilarityMatrix();
    List<TermDocumentMatrix> matrices = TermDocumentMatrix.Equalize(m1, m2);
    for (int i = 0; i < m1.NumDocs; i++)
    {
        // Loop-invariant for the inner loop: hoisted so the source length is not
        // recomputed for every target document.
        // NOTE(review): assumes ComputeLength returns double - confirm.
        double sourceLength = ComputeLength(matrices[0].GetDocument(i));
        TLLinksList links = new TLLinksList();
        for (int j = 0; j < m2.NumDocs; j++)
        {
            double lengthProduct = sourceLength * ComputeLength(matrices[1].GetDocument(j));
            if (lengthProduct == 0.0)
            {
                // At least one vector has zero length; avoid dividing by zero.
                links.Add(new TLSingleLink(m1.GetDocumentName(i), m2.GetDocumentName(j), 0.0));
            }
            else
            {
                double dotProduct = ComputeDotProduct(matrices[0].GetDocument(i), matrices[1].GetDocument(j));
                links.Add(new TLSingleLink(m1.GetDocumentName(i), m2.GetDocumentName(j), dotProduct / lengthProduct));
            }
        }
        links.Sort();
        foreach (TLSingleLink link in links)
        {
            sims.AddLink(link.SourceArtifactId, link.TargetArtifactId, link.Score);
        }
    }
    return sims;
}
/// <summary>
/// Builds a TermDocumentMatrix from imported artifacts and compares it against a matrix
/// loaded from a stored answer file: document and term counts, document names, term names,
/// and every cell value (with a tolerance of 0.0).
/// </summary>
public void ConstructorTest_Artifacts()
{
    string data = @"../../Data/SimpleCorpus.";
    TermDocumentMatrix matrix = new TermDocumentMatrix(Artifacts.Import(data + "input.target.txt"));
    TermDocumentMatrix answer = TermDocumentMatrix.Load(data + "output.target.matrix.txt");
    // Assert.AreEqual takes (expected, actual): the loaded answer is the expected value,
    // so it goes first and failure messages label the two values correctly.
    // counts
    Assert.AreEqual(answer.NumDocs, matrix.NumDocs);
    Assert.AreEqual(answer.NumTerms, matrix.NumTerms);
    // matrix
    for (int i = 0; i < answer.NumDocs; i++)
    {
        Assert.AreEqual(answer.GetDocumentName(i), matrix.GetDocumentName(i));
        // every document row must span the full term vocabulary
        Assert.AreEqual(answer.NumTerms, matrix.GetDocument(i).Length);
        for (int j = 0; j < answer.NumTerms; j++)
        {
            Assert.AreEqual(answer.GetTermName(j), matrix.GetTermName(j));
            Assert.AreEqual(answer[i, j], matrix[i, j], 0.0);
        }
    }
}
/// <summary>
/// Builds a TermDocumentMatrix from the artifacts file under the configured simple-corpus
/// directory and compares it against the matrix loaded from the "TermDocumentMatrix"
/// subdirectory: document and term counts, document names, term names, and every cell
/// value (with a tolerance of 0.0).
/// </summary>
public void ConstructorTest_Artifacts()
{
    string inputData = Settings.Default.SimpleCorpusDir;
    string outputData = Path.Combine(inputData, "TermDocumentMatrix");
    TermDocumentMatrix matrix = new TermDocumentMatrix(Artifacts.ImportFile(Path.Combine(inputData, "target.txt")));
    TermDocumentMatrix answer = TermDocumentMatrix.Load(Path.Combine(outputData, "output.txt"));
    // Assert.AreEqual takes (expected, actual): the loaded answer is the expected value,
    // so it goes first and failure messages label the two values correctly.
    // counts
    Assert.AreEqual(answer.NumDocs, matrix.NumDocs);
    Assert.AreEqual(answer.NumTerms, matrix.NumTerms);
    // matrix
    for (int i = 0; i < answer.NumDocs; i++)
    {
        Assert.AreEqual(answer.GetDocumentName(i), matrix.GetDocumentName(i));
        // every document row must span the full term vocabulary
        Assert.AreEqual(answer.NumTerms, matrix.GetDocument(i).Length);
        for (int j = 0; j < answer.NumTerms; j++)
        {
            Assert.AreEqual(answer.GetTermName(j), matrix.GetTermName(j));
            Assert.AreEqual(answer[i, j], matrix[i, j], 0.0);
        }
    }
}