Example #1
0
        /// <summary>
        /// Computes the cosine similarity between every (source, target) document pair in the matrix.
        /// Cosine similarity is (dot product) / (length * length); a pair whose length product
        /// is zero is given a score of 0.0 instead of dividing by zero.
        /// </summary>
        /// <param name="matrix">Term-by-document matrix</param>
        /// <param name="sourceIDs">Collection of source artifacts ids</param>
        /// <param name="targetIDs">Collection of target artifacts ids (enumerated once per source id)</param>
        /// <returns>Similarity matrix with one link per (source, target) pair</returns>
        public static TLSimilarityMatrix ComputeCosine(TermDocumentMatrix matrix, IEnumerable <string> sourceIDs, IEnumerable <string> targetIDs)
        {
            TLSimilarityMatrix sims = new TLSimilarityMatrix();

            foreach (string sourceID in sourceIDs)
            {
                double[] sourceDoc = matrix.GetDocument(sourceID);
                // The source vector length does not depend on the target, so compute it once per source.
                double sourceLength = ComputeLength(sourceDoc);

                foreach (string targetID in targetIDs)
                {
                    double[] targetDoc     = matrix.GetDocument(targetID);
                    double   lengthProduct = sourceLength * ComputeLength(targetDoc);

                    // Exact comparison is intentional: it only guards the division below.
                    double score = (lengthProduct == 0.0)
                        ? 0.0
                        : ComputeDotProduct(sourceDoc, targetDoc) / lengthProduct;
                    sims.AddLink(sourceID, targetID, score);
                }
            }
            return(sims);
        }
Example #2
0
 /// <summary>
 /// Creates an LDA script from two artifact collections.
 /// A term-by-document matrix is built from each collection and stored.
 /// </summary>
 /// <param name="source">Source artifacts</param>
 /// <param name="target">Target artifacts</param>
 /// <param name="config">Configuration object</param>
 public LDAScript(TLArtifactsCollection source, TLArtifactsCollection target, LDAConfig config)
 {
     // Build both matrices first (source before target), then assign the fields.
     TermDocumentMatrix sourceMatrix = new TermDocumentMatrix(source);
     TermDocumentMatrix targetMatrix = new TermDocumentMatrix(target);

     this._source = sourceMatrix;
     this._target = targetMatrix;
     this._config = config;
 }
Example #3
0
        /// <summary>
        /// Computes cosine similarities between the documents of two TermDocumentMatrices.
        /// Cosine similarity is defined as (dot product) / (length * length); a pair whose
        /// length product is zero is given a score of 0.0.
        /// The matrices are first passed through TermDocumentMatrix.Equalize, and the
        /// equalized copies supply the document vectors.
        /// </summary>
        /// <param name="m1">Matrix whose documents become the link sources</param>
        /// <param name="m2">Matrix whose documents become the link targets</param>
        /// <returns>Similarity matrix; the links of each source are added after TLLinksList.Sort()</returns>
        public static TLSimilarityMatrix ComputeCosine(TermDocumentMatrix m1, TermDocumentMatrix m2)
        {
            TLSimilarityMatrix        sims     = new TLSimilarityMatrix();
            List <TermDocumentMatrix> matrices = TermDocumentMatrix.Equalize(m1, m2);

            for (int i = 0; i < m1.NumDocs; i++)
            {
                TLLinksList links = new TLLinksList();

                if (m2.NumDocs > 0)
                {
                    // Everything that depends only on the source document is computed once per row
                    // instead of once per (source, target) pair.
                    string   sourceName   = m1.GetDocumentName(i);
                    double[] sourceDoc    = matrices[0].GetDocument(i);
                    double   sourceLength = ComputeLength(sourceDoc);

                    for (int j = 0; j < m2.NumDocs; j++)
                    {
                        double[] targetDoc     = matrices[1].GetDocument(j);
                        double   lengthProduct = sourceLength * ComputeLength(targetDoc);

                        // Exact comparison is intentional: it only guards the division below.
                        double score = (lengthProduct == 0.0)
                            ? 0.0
                            : ComputeDotProduct(sourceDoc, targetDoc) / lengthProduct;
                        links.Add(new TLSingleLink(sourceName, m2.GetDocumentName(j), score));
                    }
                }

                links.Sort();
                foreach (TLSingleLink link in links)
                {
                    sims.AddLink(link.SourceArtifactId, link.TargetArtifactId, link.Score);
                }
            }
            return(sims);
        }
Example #4
0
 /// <summary>
 /// Creates an LDA script from two prebuilt term-by-document matrices.
 /// The arguments are stored as given; nothing is copied or validated here.
 /// </summary>
 /// <param name="source">Source term-by-document matrix</param>
 /// <param name="target">Target term-by-document matrix</param>
 /// <param name="config">Configuration object</param>
 public LDAScript(TermDocumentMatrix source, TermDocumentMatrix target, LDAConfig config)
 {
     this._config = config;
     this._source = source;
     this._target = target;
 }
Example #5
0
 /// <summary>
 /// Creates a corpus over a single matrix, with the source and target
 /// documents identified by id collections.
 /// </summary>
 /// <param name="name">Corpus name</param>
 /// <param name="matrix">Input matrix</param>
 /// <param name="sourceIDs">Collection of source artifacts ids</param>
 /// <param name="targetIDs">Collection of target artifacts ids</param>
 public LDACorpus(string name, TermDocumentMatrix matrix, IEnumerable <string> sourceIDs, IEnumerable <string> targetIDs)
 {
     this.Name = name;

     // All arguments are stored by reference as given.
     this._matrix     = matrix;
     this._sourceDocs = sourceIDs;
     this._targetDocs = targetIDs;
 }
Example #6
0
 /// <summary>
 /// Creates a corpus from separate source and target matrices.
 /// Each matrix's DocMap identifies its documents, and the two matrices
 /// are merged with TermDocumentMatrix.Combine.
 /// </summary>
 /// <param name="name">Corpus name</param>
 /// <param name="source">Source matrix</param>
 /// <param name="target">Target matrix</param>
 public LDACorpus(string name, TermDocumentMatrix source, TermDocumentMatrix target)
 {
     this.Name = name;

     // Capture the document maps before combining the matrices.
     this._sourceDocs = source.DocMap;
     this._targetDocs = target.DocMap;

     this._matrix = TermDocumentMatrix.Combine(source, target);
 }
Example #7
0
 /// <summary>
 /// Computes cosine similarities between the documents of two TermDocumentMatrices.
 /// Cosine similarity is defined as (dot product) / (length * length); a pair whose
 /// length product is zero is given a score of 0.0.
 /// The matrices are first passed through TermDocumentMatrix.Equalize, and the
 /// equalized copies supply the document vectors.
 /// </summary>
 /// <param name="m1">Matrix whose documents become the link sources</param>
 /// <param name="m2">Matrix whose documents become the link targets</param>
 /// <returns>Similarity matrix; the links of each source are added after TLLinksList.Sort()</returns>
 public static TLSimilarityMatrix ComputeCosine(TermDocumentMatrix m1, TermDocumentMatrix m2)
 {
     TLSimilarityMatrix sims = new TLSimilarityMatrix();
     List<TermDocumentMatrix> matrices = TermDocumentMatrix.Equalize(m1, m2);
     for (int i = 0; i < m1.NumDocs; i++)
     {
         TLLinksList links = new TLLinksList();
         if (m2.NumDocs > 0)
         {
             // Everything that depends only on the source document is computed once per row
             // instead of once per (source, target) pair.
             string sourceName = m1.GetDocumentName(i);
             double[] sourceDoc = matrices[0].GetDocument(i);
             double sourceLength = ComputeLength(sourceDoc);
             for (int j = 0; j < m2.NumDocs; j++)
             {
                 double[] targetDoc = matrices[1].GetDocument(j);
                 double lengthProduct = sourceLength * ComputeLength(targetDoc);
                 // Exact comparison is intentional: it only guards the division below.
                 double score = (lengthProduct == 0.0)
                     ? 0.0
                     : ComputeDotProduct(sourceDoc, targetDoc) / lengthProduct;
                 links.Add(new TLSingleLink(sourceName, m2.GetDocumentName(j), score));
             }
         }
         links.Sort();
         foreach (TLSingleLink link in links)
         {
             sims.AddLink(link.SourceArtifactId, link.TargetArtifactId, link.Score);
         }
     }
     return sims;
 }
Example #8
0
        /// <summary>
        /// Loads "SourceArtifacts" and "TargetArtifacts" from the workspace, builds a
        /// term-by-document matrix from each, smooths both matrices, computes VSM
        /// similarities between them and stores the result as "Similarities".
        /// </summary>
        public override void Compute()
        {
            TLArtifactsCollection sourceCollection = (TLArtifactsCollection)Workspace.Load("SourceArtifacts");
            TermDocumentMatrix    smoothedSource   = SmoothingFilter.Compute(new TermDocumentMatrix(sourceCollection));

            TLArtifactsCollection targetCollection = (TLArtifactsCollection)Workspace.Load("TargetArtifacts");
            TermDocumentMatrix    smoothedTarget   = SmoothingFilter.Compute(new TermDocumentMatrix(targetCollection));

            TLSimilarityMatrix similarities = VSM.Compute(smoothedSource, smoothedTarget);
            Workspace.Store("Similarities", similarities);
        }
Example #9
0
        /// <summary>
        /// Creates a corpus from two artifact collections.
        /// A term-by-document matrix is built from each collection; each matrix's DocMap
        /// identifies its documents, and the two matrices are merged with
        /// TermDocumentMatrix.Combine.
        /// </summary>
        /// <param name="name">Corpus name</param>
        /// <param name="source">Source artifacts</param>
        /// <param name="target">Target artifacts</param>
        public LDACorpus(string name, TLArtifactsCollection source, TLArtifactsCollection target)
        {
            this.Name = name;

            TermDocumentMatrix sourceMatrix = new TermDocumentMatrix(source);
            TermDocumentMatrix targetMatrix = new TermDocumentMatrix(target);

            // Capture the document maps before combining the matrices.
            this._sourceDocs = sourceMatrix.DocMap;
            this._targetDocs = targetMatrix.DocMap;
            this._matrix     = TermDocumentMatrix.Combine(sourceMatrix, targetMatrix);
        }
Example #10
0
 /// <summary>
 /// Converts every weight in the matrix to a binary (0|1) presence flag.
 /// The matrix is modified in place and the same instance is returned.
 /// </summary>
 /// <param name="matrix">Term-by-document matrix</param>
 /// <returns>The input matrix, holding 1.0 where the entry was greater than zero and 0.0 otherwise.</returns>
 public static TermDocumentMatrix ComputeBinaryTF(TermDocumentMatrix matrix)
 {
     for (int doc = 0; doc < matrix.NumDocs; doc++)
     {
         for (int term = 0; term < matrix.NumTerms; term++)
         {
             if (matrix[doc, term] > 0.0)
             {
                 matrix[doc, term] = 1.0;
             }
             else
             {
                 matrix[doc, term] = 0.0;
             }
         }
     }
     return(matrix);
 }
Example #11
0
 /// <summary>
 /// Converts every weight in the matrix to a binary (0|1) presence flag.
 /// The matrix is modified in place and the same instance is returned.
 /// </summary>
 /// <param name="matrix">Term-by-document matrix</param>
 /// <returns>The input matrix, holding 1.0 where the entry was greater than zero and 0.0 otherwise.</returns>
 public static TermDocumentMatrix ComputeBinaryTF(TermDocumentMatrix matrix)
 {
     for (int doc = 0; doc < matrix.NumDocs; doc++)
     {
         for (int term = 0; term < matrix.NumTerms; term++)
         {
             bool present = matrix[doc, term] > 0.0;
             if (present)
             {
                 matrix[doc, term] = 1.0;
             }
             else
             {
                 matrix[doc, term] = 0.0;
             }
         }
     }
     return matrix;
 }
Example #12
0
 /// <summary>
 /// Scales every entry of a term-frequency matrix by the inverse document
 /// frequency of its term. The matrix is modified in place.
 /// </summary>
 /// <param name="tf">Term-frequency weighted matrix</param>
 /// <param name="idf">Inverse document frequencies vector, indexed by term</param>
 /// <returns>The input matrix, now holding tf-idf weights</returns>
 public static TermDocumentMatrix ComputeTFIDF(TermDocumentMatrix tf, double[] idf)
 {
     for (int doc = 0; doc < tf.NumDocs; doc++)
     {
         for (int term = 0; term < tf.NumTerms; term++)
         {
             double weighted = tf[doc, term] * idf[term];
             tf[doc, term] = weighted;
         }
     }
     return tf;
 }
Example #13
0
        /// <summary>
        /// Loads "SourceArtifacts" and "TargetArtifacts" from the workspace, builds one
        /// matrix over both collections, smooths the source documents and then the target
        /// documents, computes cosine similarities between the two id sets and stores
        /// the result as "Similarities".
        /// </summary>
        public override void Compute()
        {
            TLArtifactsCollection sources = (TLArtifactsCollection)Workspace.Load("SourceArtifacts");
            TLArtifactsCollection targets = (TLArtifactsCollection)Workspace.Load("TargetArtifacts");

            TermDocumentMatrix combined = new TermDocumentMatrix(sources, targets);
            // Each side is smoothed separately, using only that side's document ids.
            combined = SmoothingFilter.Compute(combined, sources.Keys);
            combined = SmoothingFilter.Compute(combined, targets.Keys);

            TLSimilarityMatrix similarities = SimilarityUtil.ComputeCosine(combined, sources.Keys, targets.Keys);
            Workspace.Store("Similarities", similarities);
        }
Example #14
0
 /// <summary>
 /// Computes the term frequencies of each document.
 /// Each term in a vector is divided by the max term in that vector.
 /// The matrix is modified in place and the same instance is returned.
 /// </summary>
 /// <remarks>
 /// A document whose maximum weight is zero is left unchanged: dividing by zero
 /// would turn every entry into NaN. A matrix with no terms is returned as is.
 /// </remarks>
 /// <param name="matrix">Term-by-document matrix</param>
 /// <returns>tf-weighted term-by-document matrix</returns>
 public static TermDocumentMatrix ComputeTF(TermDocumentMatrix matrix)
 {
     // With no terms there is nothing to scale, and Max() would throw on an empty vector.
     if (matrix.NumTerms == 0)
     {
         return(matrix);
     }

     for (int i = 0; i < matrix.NumDocs; i++)
     {
         double max = matrix.GetDocument(i).Max();
         // Exact comparison is intentional: it only guards the division below.
         if (max == 0.0)
         {
             continue;
         }
         for (int j = 0; j < matrix.NumTerms; j++)
         {
             matrix[i, j] = matrix[i, j] / max;
         }
     }
     return(matrix);
 }
Example #15
0
 /// <summary>
 /// Counts, for each term, the number of documents in which the term has a weight greater than zero.
 /// </summary>
 /// <param name="matrix">Term-by-document matrix</param>
 /// <returns>Document-frequency vector, indexed by term</returns>
 public static double[] ComputeDF(TermDocumentMatrix matrix)
 {
     double[] frequencies = new double[matrix.NumTerms];
     for (int term = 0; term < matrix.NumTerms; term++)
     {
         double containing = 0.0;
         for (int doc = 0; doc < matrix.NumDocs; doc++)
         {
             if (matrix[doc, term] > 0.0)
             {
                 containing += 1.0;
             }
         }
         frequencies[term] = containing;
     }
     return frequencies;
 }
Example #16
0
 /// <summary>
 /// Computes the per-term mean weight over all documents of the matrix.
 /// </summary>
 /// <param name="matrix">Term-by-document matrix to average</param>
 /// <returns>Average vector, indexed by term</returns>
 public static double[] ComputeAverageVector(TermDocumentMatrix matrix)
 {
     double[] average = new double[matrix.NumTerms];
     for (int term = 0; term < matrix.NumTerms; term++)
     {
         double total = 0.0;
         for (int doc = 0; doc < matrix.NumDocs; doc++)
         {
             total += matrix[doc, term];
         }
         average[term] = total / matrix.NumDocs;
     }
     return(average);
 }
Example #17
0
 /// <summary>
 /// Computes the per-term mean weight over all documents of the matrix.
 /// </summary>
 /// <param name="matrix">Term-by-document matrix to average</param>
 /// <returns>Average vector, indexed by term</returns>
 private static double[] ComputeAverageVector(TermDocumentMatrix matrix)
 {
     double[] average = new double[matrix.NumTerms];
     for (int term = 0; term < matrix.NumTerms; term++)
     {
         double total = 0.0;
         for (int doc = 0; doc < matrix.NumDocs; doc++)
         {
             total += matrix[doc, term];
         }
         average[term] = total / matrix.NumDocs;
     }
     return average;
 }
Example #18
0
 /// <summary>
 /// Computes a vector of the average weight for each term, taken over the
 /// documents identified by <paramref name="IDs"/> only.
 /// </summary>
 /// <remarks>
 /// The ids are resolved to document indices once, up front, so the collection is
 /// enumerated a single time rather than once per term. If <paramref name="IDs"/>
 /// is empty, every entry of the result is NaN (0.0 / 0).
 /// </remarks>
 /// <param name="matrix">Input matrix</param>
 /// <param name="IDs">Collection of artifacts ids</param>
 /// <returns>Average vector, indexed by term</returns>
 public static double[] ComputeAverageVector(TermDocumentMatrix matrix, IEnumerable <string> IDs)
 {
     // Resolve ids to row indices once; the lookup does not depend on the term.
     int[] docIndices = IDs.Select(docID => matrix.GetDocumentIndex(docID)).ToArray();

     double[] avg = new double[matrix.NumTerms];
     for (int j = 0; j < matrix.NumTerms; j++)
     {
         foreach (int docIndex in docIndices)
         {
             avg[j] += matrix[docIndex, j];
         }
         avg[j] = avg[j] / docIndices.Length;
     }
     return(avg);
 }
Example #19
0
 /// <summary>
 /// Computes a vector of the average weight for each term, taken over the
 /// documents identified by <paramref name="IDs"/> only.
 /// </summary>
 /// <remarks>
 /// The ids are resolved to document indices once, up front, so the collection is
 /// enumerated a single time rather than once per term. If <paramref name="IDs"/>
 /// is empty, every entry of the result is NaN (0.0 / 0).
 /// </remarks>
 /// <param name="matrix">Input matrix</param>
 /// <param name="IDs">Collection of artifacts ids</param>
 /// <returns>Average vector, indexed by term</returns>
 public static double[] ComputeAverageVector(TermDocumentMatrix matrix, IEnumerable<string> IDs)
 {
     // Resolve ids to row indices once; the lookup does not depend on the term.
     int[] docIndices = IDs.Select(docID => matrix.GetDocumentIndex(docID)).ToArray();

     double[] avg = new double[matrix.NumTerms];
     for (int j = 0; j < matrix.NumTerms; j++)
     {
         foreach (int docIndex in docIndices)
         {
             avg[j] += matrix[docIndex, j];
         }
         avg[j] = avg[j] / docIndices.Length;
     }
     return avg;
 }
Example #20
0
        /// <summary>
        /// Runs the LSA script (3 dimensions) on the stored source/target data and checks
        /// every produced link score against the stored correct similarities, within
        /// the configured double precision.
        /// </summary>
        public void ComputeLSA()
        {
            TLArtifactsCollection sourceArtifacts = TermDocumentMatrix.Load(@"../../Data/LSA/source.txt").ToTLArtifactsCollection();
            TLArtifactsCollection targetArtifacts = TermDocumentMatrix.Load(@"../../Data/LSA/target.txt").ToTLArtifactsCollection();

            REngine   rEngine = new REngine(Settings.Default.RScriptEXE);
            LSAScript script  = new LSAScript(sourceArtifacts, targetArtifacts, new LSAConfig { Dimensions = 3 });

            TLSimilarityMatrix computed = (TLSimilarityMatrix)rEngine.Execute(script);
            TLSimilarityMatrix expected = Similarities.Import(@"../../Data/LSA/correct.txt");

            foreach (TLSingleLink computedLink in computed.AllLinks)
            {
                Assert.AreEqual(
                    expected.GetScoreForLink(computedLink.SourceArtifactId, computedLink.TargetArtifactId),
                    computedLink.Score,
                    Settings.Default.DoublePrecision);
            }
        }
Example #21
0
 /// <summary>
 /// Scores every (source, target) document pair with DocumentSimilarity
 /// (Jensen-Shannon divergence based) after equalizing the two matrices.
 /// </summary>
 /// <param name="source">Source term-by-document matrix</param>
 /// <param name="target">Target term-by-document matrix</param>
 /// <returns>Similarity matrix; the links of each source are added after TLLinksList.Sort()</returns>
 public static TLSimilarityMatrix Compute(TermDocumentMatrix source, TermDocumentMatrix target)
 {
     List<TermDocumentMatrix> equalized = TermDocumentMatrix.Equalize(source, target);
     TLSimilarityMatrix result = new TLSimilarityMatrix();
     for (int s = 0; s < equalized[0].NumDocs; s++)
     {
         TLLinksList rowLinks = new TLLinksList();
         for (int t = 0; t < equalized[1].NumDocs; t++)
         {
             string sourceName = equalized[0].GetDocumentName(s);
             string targetName = equalized[1].GetDocumentName(t);
             double score = DocumentSimilarity(equalized[0].GetDocument(s), equalized[1].GetDocument(t));
             rowLinks.Add(new TLSingleLink(sourceName, targetName, score));
         }
         rowLinks.Sort();
         foreach (TLSingleLink rowLink in rowLinks)
         {
             result.AddLink(rowLink.SourceArtifactId, rowLink.TargetArtifactId, rowLink.Score);
         }
     }
     return(result);
 }
        /// <summary>
        /// Builds a TermDocumentMatrix from the imported target artifacts and checks it
        /// against the stored answer matrix: document and term counts, document names,
        /// term names and every cell (exact match, delta 0.0).
        /// </summary>
        public void ConstructorTest_Artifacts()
        {
            string             data   = @"../../Data/SimpleCorpus.";
            TermDocumentMatrix matrix = new TermDocumentMatrix(Artifacts.Import(data + "input.target.txt"));
            TermDocumentMatrix answer = TermDocumentMatrix.Load(data + "output.target.matrix.txt");

            // Assert.AreEqual takes (expected, actual): the stored answer is the expected value,
            // so failure messages report the two sides correctly.

            // counts
            Assert.AreEqual(answer.NumDocs, matrix.NumDocs);
            Assert.AreEqual(answer.NumTerms, matrix.NumTerms);
            // matrix
            for (int i = 0; i < answer.NumDocs; i++)
            {
                Assert.AreEqual(answer.GetDocumentName(i), matrix.GetDocumentName(i));
                Assert.AreEqual(answer.NumTerms, matrix.GetDocument(i).Length);
                for (int j = 0; j < answer.NumTerms; j++)
                {
                    Assert.AreEqual(answer.GetTermName(j), matrix.GetTermName(j));
                    Assert.AreEqual(answer[i, j], matrix[i, j], 0.0);
                }
            }
        }
Example #23
0
        /// <summary>
        /// Smoothing filter from ICPC'11 paper "Improving IR-based Traceability Recovery Using Smoothing Filters".
        /// Subtracts the per-term average (taken over all documents) from every entry and
        /// clamps negative results to zero. The matrix is modified in place.
        /// </summary>
        /// <param name="matrix">Term-by-document matrix</param>
        /// <returns>The input matrix after smoothing</returns>
        /// <exception cref="ArgumentException">The average vector length differs from the matrix term count.</exception>
        public static TermDocumentMatrix Compute(TermDocumentMatrix matrix)
        {
            double[] average = WeightUtil.ComputeAverageVector(matrix);

            if (average.Length != matrix.NumTerms)
            {
                throw new ArgumentException("Average vector does not have the correct number of terms.");
            }

            for (int doc = 0; doc < matrix.NumDocs; doc++)
            {
                for (int term = 0; term < matrix.NumTerms; term++)
                {
                    double smoothed = matrix[doc, term] - average[term];
                    matrix[doc, term] = (smoothed < 0.0) ? 0.0 : smoothed;
                }
            }

            return(matrix);
        }
Example #24
0
        /// <summary>
        /// Scores every (source, target) document pair with DocumentSimilarity
        /// (Jensen-Shannon divergence based) after equalizing the two matrices.
        /// </summary>
        /// <param name="source">Source term-by-document matrix</param>
        /// <param name="target">Target term-by-document matrix</param>
        /// <returns>Similarity matrix; the links of each source are added after TLLinksList.Sort()</returns>
        public static TLSimilarityMatrix Compute(TermDocumentMatrix source, TermDocumentMatrix target)
        {
            List <TermDocumentMatrix> equalized = TermDocumentMatrix.Equalize(source, target);
            TLSimilarityMatrix        result    = new TLSimilarityMatrix();

            for (int s = 0; s < equalized[0].NumDocs; s++)
            {
                TLLinksList rowLinks = new TLLinksList();
                for (int t = 0; t < equalized[1].NumDocs; t++)
                {
                    string sourceName = equalized[0].GetDocumentName(s);
                    string targetName = equalized[1].GetDocumentName(t);
                    double score      = DocumentSimilarity(equalized[0].GetDocument(s), equalized[1].GetDocument(t));
                    rowLinks.Add(new TLSingleLink(sourceName, targetName, score));
                }
                rowLinks.Sort();
                foreach (TLSingleLink rowLink in rowLinks)
                {
                    result.AddLink(rowLink.SourceArtifactId, rowLink.TargetArtifactId, rowLink.Score);
                }
            }
            return result;
        }
Example #25
0
        /// <summary>
        /// Smoothing filter from ICPC'11 paper "Improving IR-based Traceability Recovery Using Smoothing Filters".
        /// Subtracts the per-term average (taken over the given documents) from every entry
        /// of those documents and clamps negative results to zero. The matrix is modified in place.
        /// </summary>
        /// <param name="matrix">Term-by-document matrix</param>
        /// <param name="IDs">Collection of document ids to smooth.</param>
        /// <returns>The input matrix after smoothing</returns>
        /// <exception cref="ArgumentException">The average vector length differs from the matrix term count.</exception>
        public static TermDocumentMatrix Compute(TermDocumentMatrix matrix, IEnumerable<string> IDs)
        {
            double[] average = WeightUtil.ComputeAverageVector(matrix, IDs);

            if (average.Length != matrix.NumTerms)
            {
                throw new ArgumentException("Average vector does not have the correct number of terms.");
            }

            foreach (string id in IDs)
            {
                int row = matrix.GetDocumentIndex(id);
                for (int term = 0; term < matrix.NumTerms; term++)
                {
                    double smoothed = matrix[row, term] - average[term];
                    matrix[row, term] = (smoothed < 0.0) ? 0.0 : smoothed;
                }
            }

            return(matrix);
        }
        /// <summary>
        /// Builds a TermDocumentMatrix from the imported target artifacts of the simple
        /// corpus and checks it against the stored answer matrix: document and term counts,
        /// document names, term names and every cell (exact match, delta 0.0).
        /// </summary>
        public void ConstructorTest_Artifacts()
        {
            string             inputData  = Settings.Default.SimpleCorpusDir;
            string             outputData = Path.Combine(inputData, "TermDocumentMatrix");
            TermDocumentMatrix matrix     = new TermDocumentMatrix(Artifacts.ImportFile(Path.Combine(inputData, "target.txt")));
            TermDocumentMatrix answer     = TermDocumentMatrix.Load(Path.Combine(outputData, "output.txt"));

            // Assert.AreEqual takes (expected, actual): the stored answer is the expected value,
            // so failure messages report the two sides correctly.

            // counts
            Assert.AreEqual(answer.NumDocs, matrix.NumDocs);
            Assert.AreEqual(answer.NumTerms, matrix.NumTerms);
            // matrix
            for (int i = 0; i < answer.NumDocs; i++)
            {
                Assert.AreEqual(answer.GetDocumentName(i), matrix.GetDocumentName(i));
                Assert.AreEqual(answer.NumTerms, matrix.GetDocument(i).Length);
                for (int j = 0; j < answer.NumTerms; j++)
                {
                    Assert.AreEqual(answer.GetTermName(j), matrix.GetTermName(j));
                    Assert.AreEqual(answer[i, j], matrix[i, j], 0.0);
                }
            }
        }
Example #27
0
        /// <summary>
        /// Smoothing filter from ICPC'11 paper "Improving IR-based Traceability Recovery Using Smoothing Filters".
        /// Subtracts the per-term average (taken over all documents) from every entry and
        /// clamps negative results to zero. The matrix is modified in place.
        /// </summary>
        /// <param name="matrix">Term-by-document matrix</param>
        /// <returns>The input matrix after smoothing</returns>
        /// <exception cref="ArgumentException">The average vector length differs from the matrix term count.</exception>
        public static TermDocumentMatrix Compute(TermDocumentMatrix matrix)
        {
            double[] average = ComputeAverageVector(matrix);

            if (average.Length != matrix.NumTerms)
            {
                throw new ArgumentException("Average vector does not have the correct number of terms.");
            }

            for (int doc = 0; doc < matrix.NumDocs; doc++)
            {
                for (int term = 0; term < matrix.NumTerms; term++)
                {
                    double smoothed = matrix[doc, term] - average[term];
                    matrix[doc, term] = (smoothed < 0.0) ? 0.0 : smoothed;
                }
            }

            return matrix;
        }
Example #28
0
        /// <summary>
        /// Smoothing filter from ICPC'11 paper "Improving IR-based Traceability Recovery Using Smoothing Filters".
        /// Subtracts the per-term average (taken over the given documents) from every entry
        /// of those documents and clamps negative results to zero. The matrix is modified in place.
        /// </summary>
        /// <param name="matrix">Term-by-document matrix</param>
        /// <param name="IDs">Collection of document ids to smooth.</param>
        /// <returns>The input matrix after smoothing</returns>
        /// <exception cref="ArgumentException">The average vector length differs from the matrix term count.</exception>
        public static TermDocumentMatrix Compute(TermDocumentMatrix matrix, IEnumerable <string> IDs)
        {
            double[] average = WeightUtil.ComputeAverageVector(matrix, IDs);

            bool lengthMatches = average.Length == matrix.NumTerms;
            if (!lengthMatches)
            {
                throw new ArgumentException("Average vector does not have the correct number of terms.");
            }

            foreach (string id in IDs)
            {
                int row = matrix.GetDocumentIndex(id);
                for (int term = 0; term < matrix.NumTerms; term++)
                {
                    double smoothed = matrix[row, term] - average[term];
                    matrix[row, term] = (smoothed < 0.0) ? 0.0 : smoothed;
                }
            }

            return matrix;
        }
Example #29
0
 /// <summary>
 /// Computes the cosine similarity between every (source, target) document pair in the matrix.
 /// Cosine similarity is (dot product) / (length * length); a pair whose length product
 /// is zero is given a score of 0.0 instead of dividing by zero.
 /// </summary>
 /// <param name="matrix">Term-by-document matrix</param>
 /// <param name="sourceIDs">Collection of source artifacts ids</param>
 /// <param name="targetIDs">Collection of target artifacts ids (enumerated once per source id)</param>
 /// <returns>Similarity matrix with one link per (source, target) pair</returns>
 public static TLSimilarityMatrix ComputeCosine(TermDocumentMatrix matrix, IEnumerable<string> sourceIDs, IEnumerable<string> targetIDs)
 {
     TLSimilarityMatrix sims = new TLSimilarityMatrix();
     foreach (string sourceID in sourceIDs)
     {
         double[] sourceDoc = matrix.GetDocument(sourceID);
         // The source vector length does not depend on the target, so compute it once per source.
         double sourceLength = ComputeLength(sourceDoc);
         foreach (string targetID in targetIDs)
         {
             double[] targetDoc = matrix.GetDocument(targetID);
             double lengthProduct = sourceLength * ComputeLength(targetDoc);
             // Exact comparison is intentional: it only guards the division below.
             double score = (lengthProduct == 0.0)
                 ? 0.0
                 : ComputeDotProduct(sourceDoc, targetDoc) / lengthProduct;
             sims.AddLink(sourceID, targetID, score);
         }
     }
     return sims;
 }
Example #30
0
 /// <summary>
 /// Creates an RTM script whose corpus, named "RTM", is built from separate
 /// source and target matrices.
 /// </summary>
 /// <param name="source">Source term-by-document matrix</param>
 /// <param name="target">Target term-by-document matrix</param>
 /// <param name="config">Configuration object</param>
 public RTMScript(TermDocumentMatrix source, TermDocumentMatrix target, RTMConfig config)
 {
     LDACorpus corpus = new LDACorpus("RTM", source, target);

     this._corpus = corpus;
     this._config = config;
 }
Example #31
0
 /// <summary>
 /// Creates an RTM script whose corpus, named "RTM", is built from a single
 /// matrix plus the ids of its source and target documents.
 /// </summary>
 /// <param name="matrix">Input matrix</param>
 /// <param name="sourceIDs">Source artifacts ids</param>
 /// <param name="targetIDs">Target artifacts ids</param>
 /// <param name="config">Configuration object</param>
 public RTMScript(TermDocumentMatrix matrix, IEnumerable <string> sourceIDs, IEnumerable <string> targetIDs, RTMConfig config)
 {
     LDACorpus corpus = new LDACorpus("RTM", matrix, sourceIDs, targetIDs);

     this._corpus = corpus;
     this._config = config;
 }
 /// <summary>
 /// Loads the configured corpus document with TermDocumentMatrix.LoadTransposed,
 /// converts it to an artifacts collection and stores it in the workspace as "Artifacts".
 /// </summary>
 public override void Compute()
 {
     TermDocumentMatrix corpus = TermDocumentMatrix.LoadTransposed(_config.CorpusDocument.Absolute);
     Workspace.Store("Artifacts", corpus.ToTLArtifactsCollection());
 }
 /// <summary>
 /// Loads the configured corpus document as a TermDocumentMatrix and stores
 /// the matrix itself in the workspace as "Artifacts".
 /// </summary>
 public override void Compute()
 {
     TermDocumentMatrix corpus = TermDocumentMatrix.Load(_config.CorpusDocument.Absolute);
     Workspace.Store("Artifacts", corpus);
 }
Example #34
0
 /// <summary>
 /// Computes the inverse document frequencies of a TermDocumentMatrix by first
 /// computing its document frequencies and then delegating to the
 /// (document frequencies, document count) overload.
 /// </summary>
 /// <param name="matrix">Term-by-document matrix</param>
 /// <returns>IDF vector</returns>
 public static double[] ComputeIDF(TermDocumentMatrix matrix)
 {
     double[] documentFrequencies = ComputeDF(matrix);
     return(ComputeIDF(documentFrequencies, matrix.NumDocs));
 }
Example #35
0
 /// <summary>
 /// Creates an LDA script from two artifact collections.
 /// A term-by-document matrix is built from each collection and stored.
 /// </summary>
 /// <param name="source">Source artifacts</param>
 /// <param name="target">Target artifacts</param>
 /// <param name="config">Configuration object</param>
 public LDAScript(TLArtifactsCollection source, TLArtifactsCollection target, LDAConfig config)
 {
     // Build both matrices first (source before target), then assign the fields.
     TermDocumentMatrix sourceMatrix = new TermDocumentMatrix(source);
     TermDocumentMatrix targetMatrix = new TermDocumentMatrix(target);

     this._source = sourceMatrix;
     this._target = targetMatrix;
     this._config = config;
 }
Example #36
0
 /// <summary>
 /// Counts, for each term, the number of documents in which the term has a weight greater than zero.
 /// </summary>
 /// <param name="matrix">Term-by-document matrix</param>
 /// <returns>Document-frequency vector, indexed by term</returns>
 public static double[] ComputeDF(TermDocumentMatrix matrix)
 {
     double[] frequencies = new double[matrix.NumTerms];
     for (int term = 0; term < matrix.NumTerms; term++)
     {
         double containing = 0.0;
         for (int doc = 0; doc < matrix.NumDocs; doc++)
         {
             if (matrix[doc, term] > 0.0)
             {
                 containing += 1.0;
             }
         }
         frequencies[term] = containing;
     }
     return(frequencies);
 }
Example #37
0
 /// <summary>
 /// Computes tf-idf weights on a TermDocumentMatrix by combining the results of
 /// ComputeTF and ComputeIDF on the same matrix.
 /// </summary>
 /// <param name="matrix">Term-by-document matrix</param>
 /// <returns>tf-idf weighted TermDocumentMatrix</returns>
 public static TermDocumentMatrix ComputeTFIDF(TermDocumentMatrix matrix)
 {
     // Order matters to callers' expectations: term frequencies first, then IDF.
     TermDocumentMatrix termFrequencies = ComputeTF(matrix);
     double[] inverseDocFrequencies = ComputeIDF(matrix);
     return ComputeTFIDF(termFrequencies, inverseDocFrequencies);
 }
Example #38
0
 /// <summary>
 /// Scales every entry of a term-frequency matrix by the inverse document
 /// frequency of its term. The matrix is modified in place.
 /// </summary>
 /// <param name="tf">Term-frequency weighted matrix</param>
 /// <param name="idf">Inverse document frequencies vector, indexed by term</param>
 /// <returns>The input matrix, now holding tf-idf weights</returns>
 public static TermDocumentMatrix ComputeTFIDF(TermDocumentMatrix tf, double[] idf)
 {
     for (int doc = 0; doc < tf.NumDocs; doc++)
     {
         for (int term = 0; term < tf.NumTerms; term++)
         {
             double weighted = tf[doc, term] * idf[term];
             tf[doc, term] = weighted;
         }
     }
     return(tf);
 }
Example #39
0
 /// <summary>
 /// Computes tf-idf weights on a TermDocumentMatrix by combining the results of
 /// ComputeTF and ComputeIDF on the same matrix.
 /// </summary>
 /// <param name="matrix">Term-by-document matrix</param>
 /// <returns>tf-idf weighted TermDocumentMatrix</returns>
 public static TermDocumentMatrix ComputeTFIDF(TermDocumentMatrix matrix)
 {
     // Order matters to callers' expectations: term frequencies first, then IDF.
     TermDocumentMatrix termFrequencies = ComputeTF(matrix);
     double[] inverseDocFrequencies = ComputeIDF(matrix);
     return(ComputeTFIDF(termFrequencies, inverseDocFrequencies));
 }
Example #40
0
 /// <summary>
 /// Computes the term frequencies of each document.
 /// Each term in a vector is divided by the max term in that vector.
 /// The matrix is modified in place and the same instance is returned.
 /// </summary>
 /// <remarks>
 /// A document whose maximum weight is zero is left unchanged: dividing by zero
 /// would turn every entry into NaN. A matrix with no terms is returned as is.
 /// </remarks>
 /// <param name="matrix">Term-by-document matrix</param>
 /// <returns>tf-weighted term-by-document matrix</returns>
 public static TermDocumentMatrix ComputeTF(TermDocumentMatrix matrix)
 {
     // With no terms there is nothing to scale, and Max() would throw on an empty vector.
     if (matrix.NumTerms == 0)
     {
         return matrix;
     }

     for (int i = 0; i < matrix.NumDocs; i++)
     {
         double max = matrix.GetDocument(i).Max();
         // Exact comparison is intentional: it only guards the division below.
         if (max == 0.0)
         {
             continue;
         }
         for (int j = 0; j < matrix.NumTerms; j++)
         {
             matrix[i, j] = matrix[i, j] / max;
         }
     }
     return matrix;
 }
Example #41
0
 /// <summary>
 /// Creates an RTM script whose corpus, named "RTM", is built from separate
 /// source and target matrices.
 /// </summary>
 /// <param name="source">Source term-by-document matrix</param>
 /// <param name="target">Target term-by-document matrix</param>
 /// <param name="config">Configuration object</param>
 public RTMScript(TermDocumentMatrix source, TermDocumentMatrix target, RTMConfig config)
 {
     LDACorpus corpus = new LDACorpus("RTM", source, target);

     this._corpus = corpus;
     this._config = config;
 }
Example #42
0
 /// <summary>
 /// Creates an RTM script whose corpus, named "RTM", is built from a single
 /// matrix plus the ids of its source and target documents.
 /// </summary>
 /// <param name="matrix">Input matrix</param>
 /// <param name="sourceIDs">Source artifacts ids</param>
 /// <param name="targetIDs">Target artifacts ids</param>
 /// <param name="config">Configuration object</param>
 public RTMScript(TermDocumentMatrix matrix, IEnumerable<string> sourceIDs, IEnumerable<string> targetIDs, RTMConfig config)
 {
     LDACorpus corpus = new LDACorpus("RTM", matrix, sourceIDs, targetIDs);

     this._corpus = corpus;
     this._config = config;
 }
Example #43
0
 /// <summary>
 /// Creates an LDA script from two prebuilt term-by-document matrices.
 /// The arguments are stored as given; nothing is copied or validated here.
 /// </summary>
 /// <param name="source">Source term-by-document matrix</param>
 /// <param name="target">Target term-by-document matrix</param>
 /// <param name="config">Configuration object</param>
 public LDAScript(TermDocumentMatrix source, TermDocumentMatrix target, LDAConfig config)
 {
     this._config = config;
     this._source = source;
     this._target = target;
 }
Example #44
0
 /// <summary>
 /// Computes the inverse document frequencies of a TermDocumentMatrix by first
 /// computing its document frequencies and then delegating to the
 /// (document frequencies, document count) overload.
 /// </summary>
 /// <param name="matrix">Term-by-document matrix</param>
 /// <returns>IDF vector</returns>
 public static double[] ComputeIDF(TermDocumentMatrix matrix)
 {
     double[] documentFrequencies = ComputeDF(matrix);
     return ComputeIDF(documentFrequencies, matrix.NumDocs);
 }