/// <summary>
/// Computes the cosine similarity between the given document pairs in the matrix.
/// Every source id is paired with every target id; a pair whose length product
/// is zero is recorded with a score of 0.0.
/// </summary>
/// <param name="matrix">Term-by-document matrix</param>
/// <param name="sourceIDs">Collection of source artifacts ids</param>
/// <param name="targetIDs">Collection of target artifacts ids (enumerated once per source id)</param>
/// <returns>Similarity matrix containing one link per (source, target) pair</returns>
public static TLSimilarityMatrix ComputeCosine(TermDocumentMatrix matrix, IEnumerable<string> sourceIDs, IEnumerable<string> targetIDs)
{
    TLSimilarityMatrix sims = new TLSimilarityMatrix();
    foreach (string sourceID in sourceIDs)
    {
        double[] sourceDoc = matrix.GetDocument(sourceID);
        // The source vector does not change while iterating over the targets,
        // so its length is computed once per source instead of once per pair.
        double sourceLength = ComputeLength(sourceDoc);
        foreach (string targetID in targetIDs)
        {
            // compute cosine similarity between source and target
            double[] targetDoc = matrix.GetDocument(targetID);
            double lengthProduct = sourceLength * ComputeLength(targetDoc);
            if (lengthProduct == 0.0)
            {
                // Avoid dividing by zero: the pair is given a score of 0.0.
                sims.AddLink(sourceID, targetID, 0.0);
            }
            else
            {
                double score = ComputeDotProduct(sourceDoc, targetDoc) / lengthProduct;
                sims.AddLink(sourceID, targetID, score);
            }
        }
    }
    return sims;
}
/// <summary>
/// Computes the term frequencies of each document.
/// Each term in a vector is divided by the max term in that vector.
/// The matrix is modified in place and the same instance is returned.
/// A document whose max term is zero is left unchanged, because dividing
/// by zero would fill the row with NaN/Infinity values.
/// </summary>
/// <param name="matrix">Term-by-document matrix</param>
/// <returns>tf-weighted term-by-document matrix (the same instance as <paramref name="matrix"/>)</returns>
public static TermDocumentMatrix ComputeTF(TermDocumentMatrix matrix)
{
    for (int i = 0; i < matrix.NumDocs; i++)
    {
        double max = matrix.GetDocument(i).Max();
        if (max == 0.0)
        {
            // Nothing to normalize; skipping avoids x / 0 results.
            continue;
        }
        for (int j = 0; j < matrix.NumTerms; j++)
        {
            matrix[i, j] = matrix[i, j] / max;
        }
    }
    return matrix;
}
// Builds a TermDocumentMatrix from the imported target artifacts and checks it
// against the reference matrix loaded from disk: document/term counts,
// document names, term names and every cell value (zero tolerance).
public void ConstructorTest_Artifacts()
{
    string data = @"../../Data/SimpleCorpus.";
    TermDocumentMatrix matrix = new TermDocumentMatrix(Artifacts.Import(data + "input.target.txt"));
    TermDocumentMatrix answer = TermDocumentMatrix.Load(data + "output.target.matrix.txt");
    // counts (expected value first, so failure messages read correctly)
    Assert.AreEqual(answer.NumDocs, matrix.NumDocs);
    Assert.AreEqual(answer.NumTerms, matrix.NumTerms);
    // matrix
    for (int i = 0; i < answer.NumDocs; i++)
    {
        Assert.AreEqual(answer.GetDocumentName(i), matrix.GetDocumentName(i));
        Assert.AreEqual(answer.NumTerms, matrix.GetDocument(i).Length);
        for (int j = 0; j < answer.NumTerms; j++)
        {
            Assert.AreEqual(answer.GetTermName(j), matrix.GetTermName(j));
            Assert.AreEqual(answer[i, j], matrix[i, j], 0.0);
        }
    }
}
// Builds a TermDocumentMatrix from the "target.txt" artifacts file in the
// configured SimpleCorpus directory and checks it against the reference matrix
// stored under its "TermDocumentMatrix" subdirectory: document/term counts,
// document names, term names and every cell value (zero tolerance).
public void ConstructorTest_Artifacts()
{
    string inputData = Settings.Default.SimpleCorpusDir;
    string outputData = Path.Combine(inputData, "TermDocumentMatrix");
    TermDocumentMatrix matrix = new TermDocumentMatrix(Artifacts.ImportFile(Path.Combine(inputData, "target.txt")));
    TermDocumentMatrix answer = TermDocumentMatrix.Load(Path.Combine(outputData, "output.txt"));
    // counts (expected value first, so failure messages read correctly)
    Assert.AreEqual(answer.NumDocs, matrix.NumDocs);
    Assert.AreEqual(answer.NumTerms, matrix.NumTerms);
    // matrix
    for (int i = 0; i < answer.NumDocs; i++)
    {
        Assert.AreEqual(answer.GetDocumentName(i), matrix.GetDocumentName(i));
        Assert.AreEqual(answer.NumTerms, matrix.GetDocument(i).Length);
        for (int j = 0; j < answer.NumTerms; j++)
        {
            Assert.AreEqual(answer.GetTermName(j), matrix.GetTermName(j));
            Assert.AreEqual(answer[i, j], matrix[i, j], 0.0);
        }
    }
}
/// <summary>
/// Computes the cosine similarity between the given document pairs in the matrix.
/// Every source id is paired with every target id; a pair whose length product
/// is zero is recorded with a score of 0.0.
/// </summary>
/// <param name="matrix">Term-by-document matrix</param>
/// <param name="sourceIDs">Collection of source artifacts ids</param>
/// <param name="targetIDs">Collection of target artifacts ids (enumerated once per source id)</param>
/// <returns>Similarity matrix containing one link per (source, target) pair</returns>
public static TLSimilarityMatrix ComputeCosine(TermDocumentMatrix matrix, IEnumerable<string> sourceIDs, IEnumerable<string> targetIDs)
{
    TLSimilarityMatrix sims = new TLSimilarityMatrix();
    foreach (string sourceID in sourceIDs)
    {
        double[] sourceDoc = matrix.GetDocument(sourceID);
        // The source vector does not change while iterating over the targets,
        // so its length is computed once per source instead of once per pair.
        double sourceLength = ComputeLength(sourceDoc);
        foreach (string targetID in targetIDs)
        {
            // compute cosine similarity between source and target
            double[] targetDoc = matrix.GetDocument(targetID);
            double lengthProduct = sourceLength * ComputeLength(targetDoc);
            if (lengthProduct == 0.0)
            {
                // Avoid dividing by zero: the pair is given a score of 0.0.
                sims.AddLink(sourceID, targetID, 0.0);
            }
            else
            {
                double score = ComputeDotProduct(sourceDoc, targetDoc) / lengthProduct;
                sims.AddLink(sourceID, targetID, score);
            }
        }
    }
    return sims;
}
/// <summary>
/// Computes the term frequencies of each document.
/// Each term in a vector is divided by the max term in that vector.
/// The matrix is modified in place and the same instance is returned.
/// A document whose max term is zero is left unchanged, because dividing
/// by zero would fill the row with NaN/Infinity values.
/// </summary>
/// <param name="matrix">Term-by-document matrix</param>
/// <returns>tf-weighted term-by-document matrix (the same instance as <paramref name="matrix"/>)</returns>
public static TermDocumentMatrix ComputeTF(TermDocumentMatrix matrix)
{
    for (int i = 0; i < matrix.NumDocs; i++)
    {
        double max = matrix.GetDocument(i).Max();
        if (max == 0.0)
        {
            // Nothing to normalize; skipping avoids x / 0 results.
            continue;
        }
        for (int j = 0; j < matrix.NumTerms; j++)
        {
            matrix[i, j] = matrix[i, j] / max;
        }
    }
    return matrix;
}