/// <summary>
        /// Computes the cosine similarity between the given document pairs in the matrix.
        /// </summary>
        /// <remarks>
        /// One link is added for every (source, target) combination. When the product of the
        /// two values returned by ComputeLength is exactly zero, the link is recorded with a
        /// score of 0.0 instead of dividing by zero. Note that <paramref name="targetIDs"/> is
        /// enumerated once per source id.
        /// </remarks>
        /// <param name="matrix">Term-by-document matrix</param>
        /// <param name="sourceIDs">Collection of source artifacts ids</param>
        /// <param name="targetIDs">Collection of target artifacts ids</param>
        /// <returns>Similarity matrix</returns>
        public static TLSimilarityMatrix ComputeCosine(TermDocumentMatrix matrix, IEnumerable <string> sourceIDs, IEnumerable <string> targetIDs)
        {
            TLSimilarityMatrix sims = new TLSimilarityMatrix();

            foreach (string sourceID in sourceIDs)
            {
                double[] sourceDoc = matrix.GetDocument(sourceID);
                // The source length is the same for every target, so compute it once per
                // source instead of once per (source, target) pair.
                double sourceLength = ComputeLength(sourceDoc);

                foreach (string targetID in targetIDs)
                {
                    // compute cosine similarity between source and target
                    double[] targetDoc     = matrix.GetDocument(targetID);
                    double   lengthProduct = sourceLength * ComputeLength(targetDoc);

                    // A zero product would make the division below undefined; record 0.0 instead.
                    double score = 0.0;
                    if (lengthProduct != 0.0)
                    {
                        score = ComputeDotProduct(sourceDoc, targetDoc) / lengthProduct;
                    }
                    sims.AddLink(sourceID, targetID, score);
                }
            }
            return sims;
        }
Beispiel #2
0
 /// <summary>
 /// Computes the term frequencies of each document.
 /// Each term in a vector is divided by the max term in that vector.
 /// </summary>
 /// <remarks>
 /// The matrix is modified in place and the same instance is returned.
 /// A document whose vector is empty, or whose maximum entry is zero, is left unchanged:
 /// dividing by a zero maximum would otherwise turn its entries into NaN/Infinity.
 /// </remarks>
 /// <param name="matrix">Term-by-document matrix</param>
 /// <returns>tf-weighted term-by-document matrix</returns>
 public static TermDocumentMatrix ComputeTF(TermDocumentMatrix matrix)
 {
     for (int i = 0; i < matrix.NumDocs; i++)
     {
         double[] document = matrix.GetDocument(i);
         if (document.Length == 0)
         {
             // Max() throws on an empty sequence; there is nothing to normalize anyway.
             continue;
         }

         double max = document.Max();
         if (max == 0.0)
         {
             // Avoid 0/0 = NaN for documents without any non-zero maximum.
             continue;
         }

         for (int j = 0; j < matrix.NumTerms; j++)
         {
             matrix[i, j] = matrix[i, j] / max;
         }
     }
     return matrix;
 }
        /// <summary>
        /// Builds a TermDocumentMatrix from the imported target artifacts and checks it against
        /// the expected matrix loaded from disk: document/term counts, document names,
        /// term names, and every cell value.
        /// </summary>
        public void ConstructorTest_Artifacts()
        {
            string             data   = @"../../Data/SimpleCorpus.";
            TermDocumentMatrix matrix = new TermDocumentMatrix(Artifacts.Import(data + "input.target.txt"));
            TermDocumentMatrix answer = TermDocumentMatrix.Load(data + "output.target.matrix.txt");

            // counts (expected value first, actual second, so failure messages read correctly)
            Assert.AreEqual(answer.NumDocs, matrix.NumDocs, "document count");
            Assert.AreEqual(answer.NumTerms, matrix.NumTerms, "term count");

            // term names do not depend on the document, so they are checked once up front
            for (int j = 0; j < answer.NumTerms; j++)
            {
                Assert.AreEqual(answer.GetTermName(j), matrix.GetTermName(j), "term name at index " + j);
            }

            // matrix
            for (int i = 0; i < answer.NumDocs; i++)
            {
                Assert.AreEqual(answer.GetDocumentName(i), matrix.GetDocumentName(i), "document name at index " + i);
                Assert.AreEqual(answer.NumTerms, matrix.GetDocument(i).Length, "vector length of document " + i);
                for (int j = 0; j < answer.NumTerms; j++)
                {
                    // delta of 0.0: the values are expected to match exactly
                    Assert.AreEqual(answer[i, j], matrix[i, j], 0.0, "value at [" + i + ", " + j + "]");
                }
            }
        }
        /// <summary>
        /// Builds a TermDocumentMatrix from the target artifacts file in the configured
        /// simple-corpus directory and checks it against the expected matrix loaded from the
        /// "TermDocumentMatrix" subdirectory: counts, document names, term names, and cell values.
        /// </summary>
        public void ConstructorTest_Artifacts()
        {
            string             inputData  = Settings.Default.SimpleCorpusDir;
            string             outputData = Path.Combine(inputData, "TermDocumentMatrix");
            TermDocumentMatrix matrix     = new TermDocumentMatrix(Artifacts.ImportFile(Path.Combine(inputData, "target.txt")));
            TermDocumentMatrix answer     = TermDocumentMatrix.Load(Path.Combine(outputData, "output.txt"));

            // counts (expected value first, actual second, so failure messages read correctly)
            Assert.AreEqual(answer.NumDocs, matrix.NumDocs, "document count");
            Assert.AreEqual(answer.NumTerms, matrix.NumTerms, "term count");

            // term names do not depend on the document, so they are checked once up front
            for (int j = 0; j < answer.NumTerms; j++)
            {
                Assert.AreEqual(answer.GetTermName(j), matrix.GetTermName(j), "term name at index " + j);
            }

            // matrix
            for (int i = 0; i < answer.NumDocs; i++)
            {
                Assert.AreEqual(answer.GetDocumentName(i), matrix.GetDocumentName(i), "document name at index " + i);
                Assert.AreEqual(answer.NumTerms, matrix.GetDocument(i).Length, "vector length of document " + i);
                for (int j = 0; j < answer.NumTerms; j++)
                {
                    // delta of 0.0: the values are expected to match exactly
                    Assert.AreEqual(answer[i, j], matrix[i, j], 0.0, "value at [" + i + ", " + j + "]");
                }
            }
        }
Beispiel #5
0
 /// <summary>
 /// Computes the cosine similarity between the given document pairs in the matrix.
 /// </summary>
 /// <remarks>
 /// Every source id is paired with every target id and one link is added per pair.
 /// If the product of the two ComputeLength results is exactly zero, the pair is given a
 /// score of 0.0 so that no division by zero takes place. <paramref name="targetIDs"/> is
 /// enumerated once for each source id.
 /// </remarks>
 /// <param name="matrix">Term-by-document matrix</param>
 /// <param name="sourceIDs">Collection of source artifacts ids</param>
 /// <param name="targetIDs">Collection of target artifacts ids</param>
 /// <returns>Similarity matrix</returns>
 public static TLSimilarityMatrix ComputeCosine(TermDocumentMatrix matrix, IEnumerable<string> sourceIDs, IEnumerable<string> targetIDs)
 {
     TLSimilarityMatrix sims = new TLSimilarityMatrix();
     foreach (string sourceID in sourceIDs)
     {
         double[] sourceDoc = matrix.GetDocument(sourceID);
         // Loop-invariant: the source length does not change across targets,
         // so it is computed once here rather than inside the inner loop.
         double sourceLength = ComputeLength(sourceDoc);
         foreach (string targetID in targetIDs)
         {
             // compute cosine similarity between source and target
             double[] targetDoc = matrix.GetDocument(targetID);
             double lengthProduct = sourceLength * ComputeLength(targetDoc);
             if (lengthProduct == 0.0)
             {
                 // dividing by a zero product is undefined; treat the pair as dissimilar
                 sims.AddLink(sourceID, targetID, 0.0);
                 continue;
             }
             double score = ComputeDotProduct(sourceDoc, targetDoc) / lengthProduct;
             sims.AddLink(sourceID, targetID, score);
         }
     }
     return sims;
 }
Beispiel #6
0
 /// <summary>
 /// Computes the term frequencies of each document.
 /// Each term in a vector is divided by the max term in that vector.
 /// </summary>
 /// <remarks>
 /// The input matrix is updated in place and returned. Documents with an empty vector or
 /// a maximum entry of zero are skipped, because normalizing them would either throw
 /// (Max() on an empty sequence) or divide by zero and produce NaN/Infinity values.
 /// </remarks>
 /// <param name="matrix">Term-by-document matrix</param>
 /// <returns>tf-weighted term-by-document matrix</returns>
 public static TermDocumentMatrix ComputeTF(TermDocumentMatrix matrix)
 {
     for (int i = 0; i < matrix.NumDocs; i++)
     {
         double[] row = matrix.GetDocument(i);
         // Only normalize when there is a non-zero maximum to divide by.
         double max = row.Length > 0 ? row.Max() : 0.0;
         if (max != 0.0)
         {
             for (int j = 0; j < matrix.NumTerms; j++)
             {
                 matrix[i, j] = matrix[i, j] / max;
             }
         }
     }
     return matrix;
 }