/// <summary>
/// Computes the cosine similarity between the given document pairs in the matrix
/// </summary>
/// <param name="matrix">Term-by-document matrix</param>
/// <param name="sourceIDs">Collection of source artifacts ids</param>
/// <param name="targetIDs">Collection of target artifacts ids</param>
/// <returns>Similarity matrix</returns>
public static TLSimilarityMatrix ComputeCosine(TermDocumentMatrix matrix, IEnumerable<string> sourceIDs, IEnumerable<string> targetIDs)
{
    TLSimilarityMatrix sims = new TLSimilarityMatrix();
    foreach (string sourceID in sourceIDs)
    {
        double[] sourceDoc = matrix.GetDocument(sourceID);
        // the source vector's length does not depend on the target, so compute it once
        double sourceLength = ComputeLength(sourceDoc);
        foreach (string targetID in targetIDs)
        {
            // compute cosine similarity between source and target
            double[] targetDoc = matrix.GetDocument(targetID);
            double lengthProduct = sourceLength * ComputeLength(targetDoc);
            if (lengthProduct == 0.0)
            {
                // at least one vector is all zeros: similarity is defined as 0
                sims.AddLink(sourceID, targetID, 0.0);
            }
            else
            {
                double score = ComputeDotProduct(sourceDoc, targetDoc) / lengthProduct;
                sims.AddLink(sourceID, targetID, score);
            }
        }
    }
    return sims;
}
/// <summary>
/// Builds an LDA script over two artifact collections; each collection is
/// converted to a term-by-document matrix before being stored.
/// </summary>
/// <param name="source">Source artifacts</param>
/// <param name="target">Target artifacts</param>
/// <param name="config">Configuration object</param>
public LDAScript(TLArtifactsCollection source, TLArtifactsCollection target, LDAConfig config)
    : base()
{
    _config = config;
    _source = new TermDocumentMatrix(source);
    _target = new TermDocumentMatrix(target);
}
/// <summary>
/// Computes cosine similarities between two TermDocumentMatrices.
/// Cosine similarity is defined as (dot product) / (||a|| * ||b||).
/// </summary>
/// <param name="m1">Binary document matrix</param>
/// <param name="m2">tf-idf weighted document matrix</param>
/// <returns>Similarity matrix</returns>
public static TLSimilarityMatrix ComputeCosine(TermDocumentMatrix m1, TermDocumentMatrix m2)
{
    TLSimilarityMatrix sims = new TLSimilarityMatrix();
    // Equalize maps both matrices onto a shared term vocabulary
    List<TermDocumentMatrix> matrices = TermDocumentMatrix.Equalize(m1, m2);
    for (int i = 0; i < m1.NumDocs; i++)
    {
        // the source document and its length are invariant over the inner loop
        double[] sourceDoc = matrices[0].GetDocument(i);
        double sourceLength = ComputeLength(sourceDoc);
        TLLinksList links = new TLLinksList();
        for (int j = 0; j < m2.NumDocs; j++)
        {
            double[] targetDoc = matrices[1].GetDocument(j);
            double lengthProduct = sourceLength * ComputeLength(targetDoc);
            if (lengthProduct == 0.0)
            {
                // at least one vector is all zeros: similarity is defined as 0
                links.Add(new TLSingleLink(m1.GetDocumentName(i), m2.GetDocumentName(j), 0.0));
            }
            else
            {
                links.Add(new TLSingleLink(m1.GetDocumentName(i), m2.GetDocumentName(j),
                    ComputeDotProduct(sourceDoc, targetDoc) / lengthProduct));
            }
        }
        links.Sort();
        foreach (TLSingleLink link in links)
        {
            sims.AddLink(link.SourceArtifactId, link.TargetArtifactId, link.Score);
        }
    }
    return sims;
}
/// <summary>
/// Builds an LDA script directly from pre-built term-by-document matrices.
/// </summary>
/// <param name="source">Source artifacts</param>
/// <param name="target">Target artifacts</param>
/// <param name="config">Configuration object</param>
public LDAScript(TermDocumentMatrix source, TermDocumentMatrix target, LDAConfig config)
    : base()
{
    _config = config;
    _source = source;
    _target = target;
}
/// <summary>
/// Builds an LDA corpus from a combined matrix plus the id collections that
/// identify which documents are sources and which are targets.
/// </summary>
/// <param name="name">Corpus name</param>
/// <param name="matrix">Input matrix</param>
/// <param name="sourceIDs">Collection of source artifacts ids</param>
/// <param name="targetIDs">Collection of target artifacts ids</param>
public LDACorpus(string name, TermDocumentMatrix matrix, IEnumerable<string> sourceIDs, IEnumerable<string> targetIDs)
{
    Name = name;
    _matrix = matrix;
    _sourceDocs = sourceIDs;
    _targetDocs = targetIDs;
}
/// <summary>
/// Builds an LDA corpus from separate source and target matrices; the
/// document maps are captured before the matrices are merged.
/// </summary>
/// <param name="name">Corpus name</param>
/// <param name="source">Source matrix</param>
/// <param name="target">Target matrix</param>
public LDACorpus(string name, TermDocumentMatrix source, TermDocumentMatrix target)
{
    Name = name;
    _sourceDocs = source.DocMap;
    _targetDocs = target.DocMap;
    TermDocumentMatrix combined = TermDocumentMatrix.Combine(source, target);
    _matrix = combined;
}
/// <summary>
/// Computes cosine similarities between two TermDocumentMatrices.
/// Cosine similarity is defined as (dot product) / (||a|| * ||b||).
/// </summary>
/// <param name="m1">Binary document matrix</param>
/// <param name="m2">tf-idf weighted document matrix</param>
/// <returns>Similarity matrix</returns>
public static TLSimilarityMatrix ComputeCosine(TermDocumentMatrix m1, TermDocumentMatrix m2)
{
    TLSimilarityMatrix sims = new TLSimilarityMatrix();
    // Equalize maps both matrices onto a shared term vocabulary
    List<TermDocumentMatrix> matrices = TermDocumentMatrix.Equalize(m1, m2);
    for (int i = 0; i < m1.NumDocs; i++)
    {
        // hoist the source document and its length out of the inner loop;
        // both are invariant while iterating over targets
        double[] sourceDoc = matrices[0].GetDocument(i);
        double sourceLength = ComputeLength(sourceDoc);
        TLLinksList links = new TLLinksList();
        for (int j = 0; j < m2.NumDocs; j++)
        {
            double[] targetDoc = matrices[1].GetDocument(j);
            double lengthProduct = sourceLength * ComputeLength(targetDoc);
            if (lengthProduct == 0.0)
            {
                // at least one vector is all zeros: similarity is defined as 0
                links.Add(new TLSingleLink(m1.GetDocumentName(i), m2.GetDocumentName(j), 0.0));
            }
            else
            {
                links.Add(new TLSingleLink(m1.GetDocumentName(i), m2.GetDocumentName(j),
                    ComputeDotProduct(sourceDoc, targetDoc) / lengthProduct));
            }
        }
        links.Sort();
        foreach (TLSingleLink link in links)
        {
            sims.AddLink(link.SourceArtifactId, link.TargetArtifactId, link.Score);
        }
    }
    return sims;
}
/// <summary>
/// Loads the source and target artifact collections from the workspace,
/// smooths each term-by-document matrix, runs VSM, and stores the resulting
/// similarity matrix back into the workspace.
/// </summary>
public override void Compute()
{
    TLArtifactsCollection sourceCollection = (TLArtifactsCollection)Workspace.Load("SourceArtifacts");
    TLArtifactsCollection targetCollection = (TLArtifactsCollection)Workspace.Load("TargetArtifacts");
    TermDocumentMatrix smoothedSource = SmoothingFilter.Compute(new TermDocumentMatrix(sourceCollection));
    TermDocumentMatrix smoothedTarget = SmoothingFilter.Compute(new TermDocumentMatrix(targetCollection));
    Workspace.Store("Similarities", VSM.Compute(smoothedSource, smoothedTarget));
}
/// <summary>
/// Builds an LDA corpus from raw artifact collections: each collection is
/// converted to a matrix, the document maps are captured, and the two
/// matrices are merged into a single corpus matrix.
/// </summary>
/// <param name="name">Corpus name</param>
/// <param name="source">Source artifacts</param>
/// <param name="target">Target artifacts</param>
public LDACorpus(string name, TLArtifactsCollection source, TLArtifactsCollection target)
{
    Name = name;
    TermDocumentMatrix sourceMatrix = new TermDocumentMatrix(source);
    TermDocumentMatrix targetMatrix = new TermDocumentMatrix(target);
    _sourceDocs = sourceMatrix.DocMap;
    _targetDocs = targetMatrix.DocMap;
    _matrix = TermDocumentMatrix.Combine(sourceMatrix, targetMatrix);
}
/// <summary>
/// Replaces every weight in the matrix with a binary indicator:
/// 1 if the term occurs in the document, 0 otherwise. Modifies in place.
/// </summary>
/// <param name="matrix">Term-by-document matrix</param>
/// <returns>Term-by-document matrix with 1s for terms that are in the document and 0s for terms that are not.</returns>
public static TermDocumentMatrix ComputeBinaryTF(TermDocumentMatrix matrix)
{
    for (int doc = 0; doc < matrix.NumDocs; doc++)
    {
        for (int term = 0; term < matrix.NumTerms; term++)
        {
            if (matrix[doc, term] > 0.0)
            {
                matrix[doc, term] = 1.0;
            }
            else
            {
                matrix[doc, term] = 0.0;
            }
        }
    }
    return matrix;
}
/// <summary>
/// Binarizes the matrix in place: any positive weight becomes 1.0,
/// everything else becomes 0.0.
/// </summary>
/// <param name="matrix">Term-by-document matrix</param>
/// <returns>Term-by-document matrix with 1s for terms that are in the document and 0s for terms that are not.</returns>
public static TermDocumentMatrix ComputeBinaryTF(TermDocumentMatrix matrix)
{
    for (int row = 0; row < matrix.NumDocs; row++)
    {
        for (int col = 0; col < matrix.NumTerms; col++)
        {
            bool present = matrix[row, col] > 0.0;
            matrix[row, col] = present ? 1.0 : 0.0;
        }
    }
    return matrix;
}
/// <summary>
/// Computes tf-idf weights on a TDM that has been TF() and an IDF vector.
/// Modifies the matrix in place.
/// </summary>
/// <param name="tf">Term-frequency weighted matrix</param>
/// <param name="idf">Inverse document frequencies vector</param>
/// <returns>tf-idf weighted TermDocumentMatrix</returns>
/// <exception cref="ArgumentException">Thrown when the IDF vector length does not match the matrix vocabulary.</exception>
public static TermDocumentMatrix ComputeTFIDF(TermDocumentMatrix tf, double[] idf)
{
    // fail fast with a clear message instead of an IndexOutOfRangeException
    // part-way through mutating the matrix
    if (idf.Length != tf.NumTerms)
    {
        throw new ArgumentException("IDF vector does not have the correct number of terms.");
    }
    for (int i = 0; i < tf.NumDocs; i++)
    {
        for (int j = 0; j < tf.NumTerms; j++)
        {
            tf[i, j] = tf[i, j] * idf[j];
        }
    }
    return tf;
}
/// <summary>
/// Loads source and target artifacts from the workspace, builds a combined
/// term-by-document matrix, smooths the source rows and then the target rows,
/// computes cosine similarities between the two id sets, and stores the result.
/// </summary>
public override void Compute()
{
    TLArtifactsCollection sources = (TLArtifactsCollection)Workspace.Load("SourceArtifacts");
    TLArtifactsCollection targets = (TLArtifactsCollection)Workspace.Load("TargetArtifacts");
    TermDocumentMatrix combined = new TermDocumentMatrix(sources, targets);
    combined = SmoothingFilter.Compute(combined, sources.Keys);
    combined = SmoothingFilter.Compute(combined, targets.Keys);
    Workspace.Store("Similarities", SimilarityUtil.ComputeCosine(combined, sources.Keys, targets.Keys));
}
/// <summary>
/// Computes the term frequencies of each document.
/// Each term in a vector is divided by the max term in that vector.
/// Modifies the matrix in place.
/// </summary>
/// <param name="matrix">Term-by-document matrix</param>
/// <returns>tf-weighted term-by-document matrix</returns>
public static TermDocumentMatrix ComputeTF(TermDocumentMatrix matrix)
{
    // an empty vocabulary would make Max() throw; there is nothing to scale
    if (matrix.NumTerms == 0)
    {
        return matrix;
    }
    for (int i = 0; i < matrix.NumDocs; i++)
    {
        double max = matrix.GetDocument(i).Max();
        // An all-zero document would make max == 0.0 and turn every entry
        // into NaN (0.0 / 0.0); leave such documents untouched instead.
        if (max > 0.0)
        {
            for (int j = 0; j < matrix.NumTerms; j++)
            {
                matrix[i, j] = matrix[i, j] / max;
            }
        }
    }
    return matrix;
}
/// <summary>
/// Computes the document frequency of each term: the number of documents
/// in which the term has a positive weight.
/// </summary>
/// <param name="matrix">Term-by-document matrix</param>
/// <returns>df-weighted term distribution</returns>
public static double[] ComputeDF(TermDocumentMatrix matrix)
{
    double[] frequencies = new double[matrix.NumTerms];
    for (int term = 0; term < matrix.NumTerms; term++)
    {
        double count = 0.0;
        for (int doc = 0; doc < matrix.NumDocs; doc++)
        {
            if (matrix[doc, term] > 0.0)
            {
                count += 1.0;
            }
        }
        frequencies[term] = count;
    }
    return frequencies;
}
/// <summary>
/// Computes the average term vector of the matrix: each component is the
/// mean weight of that term across all documents.
/// </summary>
/// <param name="matrix">Artifacts</param>
/// <returns>Average vector</returns>
public static double[] ComputeAverageVector(TermDocumentMatrix matrix)
{
    int numDocs = matrix.NumDocs;
    double[] averages = new double[matrix.NumTerms];
    for (int term = 0; term < matrix.NumTerms; term++)
    {
        double total = 0.0;
        for (int doc = 0; doc < numDocs; doc++)
        {
            total += matrix[doc, term];
        }
        averages[term] = total / numDocs;
    }
    return averages;
}
/// <summary>
/// Computes the mean weight of every term over all documents in the matrix.
/// </summary>
/// <param name="matrix">Artifacts</param>
/// <returns>Average vector</returns>
private static double[] ComputeAverageVector(TermDocumentMatrix matrix)
{
    double[] result = new double[matrix.NumTerms];
    int n = matrix.NumDocs;
    for (int t = 0; t < matrix.NumTerms; t++)
    {
        double sum = 0.0;
        for (int d = 0; d < n; d++)
        {
            sum += matrix[d, t];
        }
        result[t] = sum / n;
    }
    return result;
}
/// <summary>
/// Computes a vector of the average weight for each term, averaged over the
/// documents named in <paramref name="IDs"/>.
/// </summary>
/// <param name="matrix">Input matrix</param>
/// <param name="IDs">Collection of artifacts ids</param>
/// <returns>Average vector</returns>
public static double[] ComputeAverageVector(TermDocumentMatrix matrix, IEnumerable<string> IDs)
{
    // Resolve each id to its row index exactly once. The original code
    // re-enumerated IDs (including IDs.Count()) and re-resolved every
    // document index for every term, i.e. O(terms * docs) lookups.
    List<int> docIndices = new List<int>();
    foreach (string docID in IDs)
    {
        docIndices.Add(matrix.GetDocumentIndex(docID));
    }
    double[] avg = new double[matrix.NumTerms];
    for (int j = 0; j < matrix.NumTerms; j++)
    {
        foreach (int docIndex in docIndices)
        {
            avg[j] += matrix[docIndex, j];
        }
        avg[j] = avg[j] / docIndices.Count;
    }
    return avg;
}
/// <summary>
/// Computes a vector of the average weight for each term, averaged over the
/// documents named in <paramref name="IDs"/>.
/// </summary>
/// <param name="matrix">Input matrix</param>
/// <param name="IDs">Collection of artifacts ids</param>
/// <returns>Average vector</returns>
public static double[] ComputeAverageVector(TermDocumentMatrix matrix, IEnumerable<string> IDs)
{
    // Materialize the row indices up front: avoids re-enumerating IDs for
    // every term (the original called IDs.Count() inside the loop) and
    // avoids resolving each document index once per term.
    List<int> rows = new List<int>();
    foreach (string docID in IDs)
    {
        rows.Add(matrix.GetDocumentIndex(docID));
    }
    double[] avg = new double[matrix.NumTerms];
    for (int j = 0; j < matrix.NumTerms; j++)
    {
        foreach (int row in rows)
        {
            avg[j] += matrix[row, j];
        }
        avg[j] = avg[j] / rows.Count;
    }
    return avg;
}
/// <summary>
/// End-to-end LSA check: runs the LSA R script over the sample corpus and
/// compares every produced link score against the stored expected values.
/// </summary>
public void ComputeLSA()
{
    TLArtifactsCollection sourceArtifacts = TermDocumentMatrix.Load(@"../../Data/LSA/source.txt").ToTLArtifactsCollection();
    TLArtifactsCollection targetArtifacts = TermDocumentMatrix.Load(@"../../Data/LSA/target.txt").ToTLArtifactsCollection();
    REngine engine = new REngine(Settings.Default.RScriptEXE);
    LSAConfig config = new LSAConfig { Dimensions = 3 };
    TLSimilarityMatrix computed = (TLSimilarityMatrix)engine.Execute(new LSAScript(sourceArtifacts, targetArtifacts, config));
    TLSimilarityMatrix expected = Similarities.Import(@"../../Data/LSA/correct.txt");
    foreach (TLSingleLink link in computed.AllLinks)
    {
        Assert.AreEqual(expected.GetScoreForLink(link.SourceArtifactId, link.TargetArtifactId), link.Score, Settings.Default.DoublePrecision);
    }
}
/// <summary>
/// Computes Jensen-Shannon divergence on two TermDocumentMatrices.
/// Links for each source document are sorted before being added.
/// </summary>
/// <param name="source">Source artifacts collection</param>
/// <param name="target">Target artifacts collection</param>
/// <returns>Similarity matrix</returns>
public static TLSimilarityMatrix Compute(TermDocumentMatrix source, TermDocumentMatrix target)
{
    List<TermDocumentMatrix> equalized = TermDocumentMatrix.Equalize(source, target);
    TermDocumentMatrix src = equalized[0];
    TermDocumentMatrix tgt = equalized[1];
    TLSimilarityMatrix similarities = new TLSimilarityMatrix();
    for (int i = 0; i < src.NumDocs; i++)
    {
        TLLinksList ranked = new TLLinksList();
        for (int j = 0; j < tgt.NumDocs; j++)
        {
            double score = DocumentSimilarity(src.GetDocument(i), tgt.GetDocument(j));
            ranked.Add(new TLSingleLink(src.GetDocumentName(i), tgt.GetDocumentName(j), score));
        }
        ranked.Sort();
        foreach (TLSingleLink link in ranked)
        {
            similarities.AddLink(link.SourceArtifactId, link.TargetArtifactId, link.Score);
        }
    }
    return similarities;
}
/// <summary>
/// Verifies the TermDocumentMatrix(artifacts) constructor against a stored
/// reference matrix: document/term counts, names, and every cell value.
/// </summary>
public void ConstructorTest_Artifacts()
{
    string data = @"../../Data/SimpleCorpus.";
    TermDocumentMatrix actual = new TermDocumentMatrix(Artifacts.Import(data + "input.target.txt"));
    TermDocumentMatrix expected = TermDocumentMatrix.Load(data + "output.target.matrix.txt");
    // counts
    Assert.AreEqual(actual.NumDocs, expected.NumDocs);
    Assert.AreEqual(actual.NumTerms, expected.NumTerms);
    // matrix contents
    for (int doc = 0; doc < expected.NumDocs; doc++)
    {
        Assert.AreEqual(actual.GetDocumentName(doc), expected.GetDocumentName(doc));
        Assert.AreEqual(actual.GetDocument(doc).Length, expected.NumTerms);
        for (int term = 0; term < expected.NumTerms; term++)
        {
            Assert.AreEqual(actual.GetTermName(term), expected.GetTermName(term));
            Assert.AreEqual(actual[doc, term], expected[doc, term], 0.0);
        }
    }
}
/// <summary>
/// Smoothing filter from ICPC'11 paper "Improving IR-based Traceability Recovery Using Smoothing Filters".
/// Subtracts the corpus-wide average term vector from every document,
/// clamping negative weights to zero. Modifies the matrix in place.
/// </summary>
/// <param name="matrix">Term-by-document matrix</param>
/// <returns>Smoothed artifacts</returns>
public static TermDocumentMatrix Compute(TermDocumentMatrix matrix)
{
    double[] average = WeightUtil.ComputeAverageVector(matrix);
    if (average.Length != matrix.NumTerms)
        throw new ArgumentException("Average vector does not have the correct number of terms.");
    for (int doc = 0; doc < matrix.NumDocs; doc++)
    {
        for (int term = 0; term < matrix.NumTerms; term++)
        {
            double smoothed = matrix[doc, term] - average[term];
            matrix[doc, term] = (smoothed < 0.0) ? 0.0 : smoothed;
        }
    }
    return matrix;
}
/// <summary>
/// Computes Jensen-Shannon divergence on two TermDocumentMatrices,
/// equalizing their vocabularies first and sorting each source document's
/// links before adding them to the result.
/// </summary>
/// <param name="source">Source artifacts collection</param>
/// <param name="target">Target artifacts collection</param>
/// <returns>Similarity matrix</returns>
public static TLSimilarityMatrix Compute(TermDocumentMatrix source, TermDocumentMatrix target)
{
    List<TermDocumentMatrix> pair = TermDocumentMatrix.Equalize(source, target);
    TermDocumentMatrix sourceMatrix = pair[0];
    TermDocumentMatrix targetMatrix = pair[1];
    TLSimilarityMatrix sims = new TLSimilarityMatrix();
    for (int srcIdx = 0; srcIdx < sourceMatrix.NumDocs; srcIdx++)
    {
        TLLinksList candidates = new TLLinksList();
        for (int tgtIdx = 0; tgtIdx < targetMatrix.NumDocs; tgtIdx++)
        {
            double divergence = DocumentSimilarity(sourceMatrix.GetDocument(srcIdx), targetMatrix.GetDocument(tgtIdx));
            candidates.Add(new TLSingleLink(sourceMatrix.GetDocumentName(srcIdx), targetMatrix.GetDocumentName(tgtIdx), divergence));
        }
        candidates.Sort();
        foreach (TLSingleLink link in candidates)
        {
            sims.AddLink(link.SourceArtifactId, link.TargetArtifactId, link.Score);
        }
    }
    return sims;
}
/// <summary>
/// Smoothing filter from ICPC'11 paper "Improving IR-based Traceability Recovery Using Smoothing Filters".
/// Smooths only the documents named in <paramref name="IDs"/>: subtracts the
/// average vector over those documents and clamps negatives to zero.
/// Modifies the matrix in place.
/// </summary>
/// <param name="matrix">Term-by-document matrix</param>
/// <param name="IDs">Collection of document ids to smooth.</param>
/// <returns>Smoothed artifacts</returns>
public static TermDocumentMatrix Compute(TermDocumentMatrix matrix, IEnumerable<string> IDs)
{
    double[] average = WeightUtil.ComputeAverageVector(matrix, IDs);
    if (average.Length != matrix.NumTerms)
        throw new ArgumentException("Average vector does not have the correct number of terms.");
    foreach (string id in IDs)
    {
        int row = matrix.GetDocumentIndex(id);
        for (int term = 0; term < matrix.NumTerms; term++)
        {
            double smoothed = matrix[row, term] - average[term];
            matrix[row, term] = smoothed < 0.0 ? 0.0 : smoothed;
        }
    }
    return matrix;
}
/// <summary>
/// Verifies the TermDocumentMatrix(artifacts) constructor against a stored
/// reference matrix loaded from the configured corpus directory.
/// </summary>
public void ConstructorTest_Artifacts()
{
    string inputData = Settings.Default.SimpleCorpusDir;
    string outputData = Path.Combine(inputData, "TermDocumentMatrix");
    TermDocumentMatrix actual = new TermDocumentMatrix(Artifacts.ImportFile(Path.Combine(inputData, "target.txt")));
    TermDocumentMatrix expected = TermDocumentMatrix.Load(Path.Combine(outputData, "output.txt"));
    // counts
    Assert.AreEqual(actual.NumDocs, expected.NumDocs);
    Assert.AreEqual(actual.NumTerms, expected.NumTerms);
    // matrix contents
    for (int doc = 0; doc < expected.NumDocs; doc++)
    {
        Assert.AreEqual(actual.GetDocumentName(doc), expected.GetDocumentName(doc));
        Assert.AreEqual(actual.GetDocument(doc).Length, expected.NumTerms);
        for (int term = 0; term < expected.NumTerms; term++)
        {
            Assert.AreEqual(actual.GetTermName(term), expected.GetTermName(term));
            Assert.AreEqual(actual[doc, term], expected[doc, term], 0.0);
        }
    }
}
/// <summary>
/// Smoothing filter from ICPC'11 paper "Improving IR-based Traceability Recovery Using Smoothing Filters".
/// Subtracts the average term vector from every document and clamps negative
/// results to zero. Modifies the matrix in place.
/// </summary>
/// <param name="matrix">Term-by-document matrix</param>
/// <returns>Smoothed artifacts</returns>
public static TermDocumentMatrix Compute(TermDocumentMatrix matrix)
{
    double[] average = ComputeAverageVector(matrix);
    if (average.Length != matrix.NumTerms)
    {
        throw new ArgumentException("Average vector does not have the correct number of terms.");
    }
    for (int row = 0; row < matrix.NumDocs; row++)
    {
        for (int col = 0; col < matrix.NumTerms; col++)
        {
            double value = matrix[row, col] - average[col];
            matrix[row, col] = value < 0.0 ? 0.0 : value;
        }
    }
    return matrix;
}
/// <summary>
/// Smoothing filter from ICPC'11 paper "Improving IR-based Traceability Recovery Using Smoothing Filters".
/// Applies the filter only to the documents named in <paramref name="IDs"/>,
/// using the average vector computed over that subset. Modifies in place.
/// </summary>
/// <param name="matrix">Term-by-document matrix</param>
/// <param name="IDs">Collection of document ids to smooth.</param>
/// <returns>Smoothed artifacts</returns>
public static TermDocumentMatrix Compute(TermDocumentMatrix matrix, IEnumerable<string> IDs)
{
    double[] average = WeightUtil.ComputeAverageVector(matrix, IDs);
    if (average.Length != matrix.NumTerms)
    {
        throw new ArgumentException("Average vector does not have the correct number of terms.");
    }
    foreach (string docID in IDs)
    {
        int row = matrix.GetDocumentIndex(docID);
        for (int col = 0; col < matrix.NumTerms; col++)
        {
            double value = matrix[row, col] - average[col];
            matrix[row, col] = (value < 0.0) ? 0.0 : value;
        }
    }
    return matrix;
}
/// <summary>
/// Computes the cosine similarity between the given document pairs in the matrix
/// </summary>
/// <param name="matrix">Term-by-document matrix</param>
/// <param name="sourceIDs">Collection of source artifacts ids</param>
/// <param name="targetIDs">Collection of target artifacts ids</param>
/// <returns>Similarity matrix</returns>
public static TLSimilarityMatrix ComputeCosine(TermDocumentMatrix matrix, IEnumerable<string> sourceIDs, IEnumerable<string> targetIDs)
{
    TLSimilarityMatrix sims = new TLSimilarityMatrix();
    foreach (string sourceID in sourceIDs)
    {
        double[] sourceDoc = matrix.GetDocument(sourceID);
        // loop-invariant: the source vector's length is the same for every target
        double sourceLength = ComputeLength(sourceDoc);
        foreach (string targetID in targetIDs)
        {
            // compute cosine similarity between source and target
            double[] targetDoc = matrix.GetDocument(targetID);
            double lengthProduct = sourceLength * ComputeLength(targetDoc);
            if (lengthProduct == 0.0)
            {
                // at least one vector is all zeros: similarity is defined as 0
                sims.AddLink(sourceID, targetID, 0.0);
            }
            else
            {
                double score = ComputeDotProduct(sourceDoc, targetDoc) / lengthProduct;
                sims.AddLink(sourceID, targetID, score);
            }
        }
    }
    return sims;
}
/// <summary>
/// Builds an RTM script: wraps the two matrices in an "RTM" corpus and keeps
/// the configuration for later execution.
/// </summary>
/// <param name="source">Source artifacts</param>
/// <param name="target">Target artifacts</param>
/// <param name="config">Configuration object</param>
public RTMScript(TermDocumentMatrix source, TermDocumentMatrix target, RTMConfig config)
    : base()
{
    _config = config;
    _corpus = new LDACorpus("RTM", source, target);
}
/// <summary>
/// Builds an RTM script from a pre-combined matrix plus the id collections
/// that distinguish source documents from target documents.
/// </summary>
/// <param name="matrix">Input matrix</param>
/// <param name="sourceIDs">Source artifacts ids</param>
/// <param name="targetIDs">Target artifacts ids</param>
/// <param name="config">Configuration object</param>
public RTMScript(TermDocumentMatrix matrix, IEnumerable<string> sourceIDs, IEnumerable<string> targetIDs, RTMConfig config)
{
    _config = config;
    _corpus = new LDACorpus("RTM", matrix, sourceIDs, targetIDs);
}
/// <summary>
/// Loads the configured corpus document as a transposed term-by-document
/// matrix, converts it to an artifacts collection, and stores it.
/// </summary>
public override void Compute()
{
    TLArtifactsCollection artifacts = TermDocumentMatrix.LoadTransposed(_config.CorpusDocument.Absolute).ToTLArtifactsCollection();
    Workspace.Store("Artifacts", artifacts);
}
/// <summary>
/// Loads the configured corpus document as a term-by-document matrix and
/// stores it in the workspace under "Artifacts".
/// </summary>
public override void Compute()
{
    TermDocumentMatrix corpus = TermDocumentMatrix.Load(_config.CorpusDocument.Absolute);
    Workspace.Store("Artifacts", corpus);
}
/// <summary>
/// Computes the inverse document frequencies of a TermDocumentMatrix by
/// first computing the document frequencies, then applying the IDF formula.
/// </summary>
/// <param name="matrix">TDM</param>
/// <returns>IDF vector</returns>
public static double[] ComputeIDF(TermDocumentMatrix matrix)
{
    double[] documentFrequencies = ComputeDF(matrix);
    return ComputeIDF(documentFrequencies, matrix.NumDocs);
}
/// <summary>
/// Counts, for each term, the number of documents with a positive weight
/// for that term.
/// </summary>
/// <param name="matrix">Term-by-document matrix</param>
/// <returns>df-weighted term distribution</returns>
public static double[] ComputeDF(TermDocumentMatrix matrix)
{
    double[] df = new double[matrix.NumTerms];
    for (int t = 0; t < matrix.NumTerms; t++)
    {
        double occurrences = 0.0;
        for (int d = 0; d < matrix.NumDocs; d++)
        {
            if (matrix[d, t] > 0.0)
            {
                occurrences += 1.0;
            }
        }
        df[t] = occurrences;
    }
    return df;
}
/// <summary>
/// Computes tf-idf weights on a TermDocumentMatrix: applies TF weighting,
/// then multiplies by the IDF vector.
/// </summary>
/// <param name="matrix">Term-by-document matrix (modified in place)</param>
/// <returns>tf-idf weighted matrix</returns>
public static TermDocumentMatrix ComputeTFIDF(TermDocumentMatrix matrix)
{
    // TF must run first: the original argument list evaluated left to right,
    // so IDF was computed on the already TF-weighted (in-place) matrix.
    TermDocumentMatrix tf = ComputeTF(matrix);
    double[] idf = ComputeIDF(matrix);
    return ComputeTFIDF(tf, idf);
}
/// <summary>
/// Computes tf-idf weights on a TDM that has been TF() and an IDF vector.
/// Modifies the matrix in place.
/// </summary>
/// <param name="tf">Term-frequency weighted matrix</param>
/// <param name="idf">Inverse document frequencies vector</param>
/// <returns>tf-idf weighted TermDocumentMatrix</returns>
/// <exception cref="ArgumentException">Thrown when the IDF vector length does not match the matrix vocabulary.</exception>
public static TermDocumentMatrix ComputeTFIDF(TermDocumentMatrix tf, double[] idf)
{
    // validate up front so a mismatched vector fails cleanly instead of
    // throwing IndexOutOfRangeException after partially mutating the matrix
    if (idf.Length != tf.NumTerms)
    {
        throw new ArgumentException("IDF vector does not have the correct number of terms.");
    }
    for (int i = 0; i < tf.NumDocs; i++)
    {
        for (int j = 0; j < tf.NumTerms; j++)
        {
            tf[i, j] = tf[i, j] * idf[j];
        }
    }
    return tf;
}
/// <summary>
/// Full tf-idf pipeline for a single matrix: TF-weight the matrix in place,
/// derive the IDF vector, and combine the two.
/// </summary>
/// <param name="matrix">Term-by-document matrix (modified in place)</param>
/// <returns>tf-idf weighted matrix</returns>
public static TermDocumentMatrix ComputeTFIDF(TermDocumentMatrix matrix)
{
    // Preserve the original left-to-right evaluation: ComputeTF mutates the
    // matrix before ComputeIDF reads it.
    TermDocumentMatrix weighted = ComputeTF(matrix);
    double[] idfVector = ComputeIDF(matrix);
    return ComputeTFIDF(weighted, idfVector);
}
/// <summary>
/// Computes the term frequencies of each document.
/// Each term in a vector is divided by the max term in that vector.
/// Modifies the matrix in place.
/// </summary>
/// <param name="matrix">Term-by-document matrix</param>
/// <returns>tf-weighted term-by-document matrix</returns>
public static TermDocumentMatrix ComputeTF(TermDocumentMatrix matrix)
{
    // with no terms there is nothing to scale, and Max() on an empty
    // document vector would throw
    if (matrix.NumTerms == 0)
    {
        return matrix;
    }
    for (int i = 0; i < matrix.NumDocs; i++)
    {
        double max = matrix.GetDocument(i).Max();
        // Guard against all-zero documents: dividing by max == 0.0 would
        // turn every entry into NaN (0.0 / 0.0). Leave such rows as-is.
        if (max > 0.0)
        {
            for (int j = 0; j < matrix.NumTerms; j++)
            {
                matrix[i, j] = matrix[i, j] / max;
            }
        }
    }
    return matrix;
}
/// <summary>
/// Builds an RTM script from an already combined matrix and the id sets that
/// partition it into source and target documents.
/// </summary>
/// <param name="matrix">Input matrix</param>
/// <param name="sourceIDs">Source artifacts ids</param>
/// <param name="targetIDs">Target artifacts ids</param>
/// <param name="config">Configuration object</param>
public RTMScript(TermDocumentMatrix matrix, IEnumerable<string> sourceIDs, IEnumerable<string> targetIDs, RTMConfig config)
{
    _config = config;
    _corpus = new LDACorpus("RTM", matrix, sourceIDs, targetIDs);
}
/// <summary>
/// Derives the IDF vector for a matrix: document frequencies first, then
/// the IDF transform over the document count.
/// </summary>
/// <param name="matrix">TDM</param>
/// <returns>IDF vector</returns>
public static double[] ComputeIDF(TermDocumentMatrix matrix)
{
    double[] df = ComputeDF(matrix);
    return ComputeIDF(df, matrix.NumDocs);
}