Example #1
0
        /// <summary>
        /// Computes cosine similarities between two TermDocumentMatrices.
        /// Cosine similarity is defined as (dot product) / (length * length)
        /// </summary>
        /// <param name="m1">Binary document matrix</param>
        /// <param name="m2">tf-idf weighted document matrix</param>
        /// <returns>Similarity matrix</returns>
        public static TLSimilarityMatrix ComputeCosine(TermDocumentMatrix m1, TermDocumentMatrix m2)
        {
            TLSimilarityMatrix        sims     = new TLSimilarityMatrix();
            List <TermDocumentMatrix> matrices = TermDocumentMatrix.Equalize(m1, m2);

            for (int i = 0; i < m1.NumDocs; i++)
            {
                TLLinksList links = new TLLinksList();
                for (int j = 0; j < m2.NumDocs; j++)
                {
                    double lengthProduct = ComputeLength(matrices[0].GetDocument(i)) * ComputeLength(matrices[1].GetDocument(j));
                    if (lengthProduct == 0.0)
                    {
                        links.Add(new TLSingleLink(m1.GetDocumentName(i), m2.GetDocumentName(j), 0.0));
                    }
                    else
                    {
                        links.Add(new TLSingleLink(m1.GetDocumentName(i), m2.GetDocumentName(j), ComputeDotProduct(matrices[0].GetDocument(i), matrices[1].GetDocument(j)) / lengthProduct));
                    }
                }
                links.Sort();
                foreach (TLSingleLink link in links)
                {
                    sims.AddLink(link.SourceArtifactId, link.TargetArtifactId, link.Score);
                }
            }
            return(sims);
        }
Example #2
0
 /// <summary>
 /// Computes cosine similarities between two TermDocumentMatrices.
 /// Cosine similarity is defined as (dot product) / (length * length)
 /// </summary>
 /// <param name="m1">Binary document matrix</param>
 /// <param name="m2">tf-idf weighted document matrix</param>
 /// <returns>Similarity matrix</returns>
 public static TLSimilarityMatrix ComputeCosine(TermDocumentMatrix m1, TermDocumentMatrix m2)
 {
     TLSimilarityMatrix sims = new TLSimilarityMatrix();
     List<TermDocumentMatrix> matrices = TermDocumentMatrix.Equalize(m1, m2);
     for (int i = 0; i < m1.NumDocs; i++)
     {
         TLLinksList links = new TLLinksList();
         for (int j = 0; j < m2.NumDocs; j++)
         {
             double lengthProduct = ComputeLength(matrices[0].GetDocument(i)) * ComputeLength(matrices[1].GetDocument(j));
             if (lengthProduct == 0.0)
             {
                 links.Add(new TLSingleLink(m1.GetDocumentName(i), m2.GetDocumentName(j), 0.0));
             }
             else
             {
                 links.Add(new TLSingleLink(m1.GetDocumentName(i), m2.GetDocumentName(j), ComputeDotProduct(matrices[0].GetDocument(i), matrices[1].GetDocument(j)) / lengthProduct));
             }
         }
         links.Sort();
         foreach (TLSingleLink link in links)
         {
             sims.AddLink(link.SourceArtifactId, link.TargetArtifactId, link.Score);
         }
     }
     return sims;
 }
        public void ConstructorTest_Artifacts()
        {
            string             data   = @"../../Data/SimpleCorpus.";
            TermDocumentMatrix matrix = new TermDocumentMatrix(Artifacts.Import(data + "input.target.txt"));
            TermDocumentMatrix answer = TermDocumentMatrix.Load(data + "output.target.matrix.txt");

            // counts
            Assert.AreEqual(matrix.NumDocs, answer.NumDocs);
            Assert.AreEqual(matrix.NumTerms, answer.NumTerms);
            // matrix
            for (int i = 0; i < answer.NumDocs; i++)
            {
                Assert.AreEqual(matrix.GetDocumentName(i), answer.GetDocumentName(i));
                Assert.AreEqual(matrix.GetDocument(i).Length, answer.NumTerms);
                for (int j = 0; j < answer.NumTerms; j++)
                {
                    Assert.AreEqual(matrix.GetTermName(j), answer.GetTermName(j));
                    Assert.AreEqual(matrix[i, j], answer[i, j], 0.0);
                }
            }
        }
        public void ConstructorTest_Artifacts()
        {
            string             inputData  = Settings.Default.SimpleCorpusDir;
            string             outputData = Path.Combine(inputData, "TermDocumentMatrix");
            TermDocumentMatrix matrix     = new TermDocumentMatrix(Artifacts.ImportFile(Path.Combine(inputData, "target.txt")));
            TermDocumentMatrix answer     = TermDocumentMatrix.Load(Path.Combine(outputData, "output.txt"));

            // counts
            Assert.AreEqual(matrix.NumDocs, answer.NumDocs);
            Assert.AreEqual(matrix.NumTerms, answer.NumTerms);
            // matrix
            for (int i = 0; i < answer.NumDocs; i++)
            {
                Assert.AreEqual(matrix.GetDocumentName(i), answer.GetDocumentName(i));
                Assert.AreEqual(matrix.GetDocument(i).Length, answer.NumTerms);
                for (int j = 0; j < answer.NumTerms; j++)
                {
                    Assert.AreEqual(matrix.GetTermName(j), answer.GetTermName(j));
                    Assert.AreEqual(matrix[i, j], answer[i, j], 0.0);
                }
            }
        }