/// <summary>
/// Computes cosine similarities between documents via the Vector Space Model,
/// using the requested weighting scheme.
/// </summary>
/// <param name="source">Source artifacts</param>
/// <param name="target">Target artifacts</param>
/// <param name="weight">Weighting scheme</param>
/// <returns>Similarity matrix</returns>
/// <exception cref="NotImplementedException">
/// <paramref name="weight"/> is not one of the schemes handled below.
/// </exception>
public static TLSimilarityMatrix Compute(TLArtifactsCollection source, TLArtifactsCollection target, VSMWeightEnum weight)
{
    if (weight == VSMWeightEnum.TFIDF)
    {
        // One joint matrix over both collections, TF-IDF weighted before comparison.
        TermDocumentMatrix joint = new TermDocumentMatrix(source, target);
        var weighted = WeightUtil.ComputeTFIDF(joint);
        return SimilarityUtil.ComputeCosine(weighted, source.Keys, target.Keys);
    }

    if (weight == VSMWeightEnum.BooleanQueriesAndTFIDFCorpus)
    {
        // Separate matrices: binary term frequency for the source side,
        // TF-IDF for the target side.
        var queries = WeightUtil.ComputeBinaryTF(new TermDocumentMatrix(source));
        var corpus = WeightUtil.ComputeTFIDF(new TermDocumentMatrix(target));
        return SimilarityUtil.ComputeCosine(queries, corpus);
    }

    if (weight == VSMWeightEnum.NoWeight)
    {
        // The joint matrix is compared as constructed, with no weighting step.
        TermDocumentMatrix unweighted = new TermDocumentMatrix(source, target);
        return SimilarityUtil.ComputeCosine(unweighted, source.Keys, target.Keys);
    }

    throw new NotImplementedException("Unknown weighting scheme \"" + weight + "\"");
}
/// <summary>
/// Smoothing filter from the ICPC'11 paper
/// "Improving IR-based Traceability Recovery Using Smoothing Filters".
/// The vector returned by <c>WeightUtil.ComputeAverageVector</c> is subtracted from
/// every document row, and any entry that becomes negative is reset to zero.
/// </summary>
/// <param name="matrix">Term-by-document matrix; it is modified in place.</param>
/// <returns>The same <paramref name="matrix"/> instance after smoothing.</returns>
/// <exception cref="ArgumentException">
/// The average vector's length differs from the matrix's number of terms.
/// </exception>
public static TermDocumentMatrix Compute(TermDocumentMatrix matrix)
{
    double[] termAverages = WeightUtil.ComputeAverageVector(matrix);
    int termCount = matrix.NumTerms;
    if (termAverages.Length != termCount)
    {
        throw new ArgumentException("Average vector does not have the correct number of terms.");
    }

    int docCount = matrix.NumDocs;
    for (int doc = 0; doc < docCount; doc++)
    {
        for (int term = 0; term < termCount; term++)
        {
            // Subtract the per-term average, flooring the result at zero.
            double shifted = matrix[doc, term] - termAverages[term];
            matrix[doc, term] = shifted < 0.0 ? 0.0 : shifted;
        }
    }

    return matrix;
}
/// <summary>
/// Smoothing filter from the ICPC'11 paper
/// "Improving IR-based Traceability Recovery Using Smoothing Filters",
/// restricted to a subset of documents. The vector returned by
/// <c>WeightUtil.ComputeAverageVector</c> for the given ids is subtracted from the row
/// of each listed document, and any entry that becomes negative is reset to zero.
/// </summary>
/// <param name="matrix">Term-by-document matrix; it is modified in place.</param>
/// <param name="IDs">Collection of document ids to smooth. It is enumerated exactly once.</param>
/// <returns>The same <paramref name="matrix"/> instance after smoothing.</returns>
/// <exception cref="ArgumentNullException"><paramref name="IDs"/> is null.</exception>
/// <exception cref="ArgumentException">
/// The average vector's length differs from the matrix's number of terms.
/// </exception>
public static TermDocumentMatrix Compute(TermDocumentMatrix matrix, IEnumerable<string> IDs)
{
    if (IDs == null)
    {
        throw new ArgumentNullException("IDs");
    }

    // Snapshot the ids once: the sequence is needed both for the average and for the
    // smoothing pass, and a deferred or single-pass enumerable could otherwise yield
    // different ids on each pass. The local stays typed as IEnumerable<string> so the
    // same ComputeAverageVector overload is selected as before.
    IEnumerable<string> docIDs = new List<string>(IDs);

    double[] avg = WeightUtil.ComputeAverageVector(matrix, docIDs);
    if (avg.Length != matrix.NumTerms)
    {
        throw new ArgumentException("Average vector does not have the correct number of terms.");
    }

    foreach (string docID in docIDs)
    {
        int i = matrix.GetDocumentIndex(docID);
        for (int j = 0; j < matrix.NumTerms; j++)
        {
            // Subtract the per-term average, flooring the result at zero.
            matrix[i, j] -= avg[j];
            if (matrix[i, j] < 0.0)
            {
                matrix[i, j] = 0.0;
            }
        }
    }

    return matrix;
}