public void GetTopNLinks()
{
    // Requests the four top links from the shared fixture matrix and checks
    // both the number of links returned and their order.
    TLLinksList actual = TLSimilarityMatrixUtil.GetTopNLinks(sims, 4);
#if Verbose
    Console.WriteLine("TLSimilarityMatrixUtilTest.GetTopNLinks()");
    foreach (TLSingleLink entry in actual)
    {
        Console.WriteLine("{0}\t{1}\t{2}",
            entry.SourceArtifactId,
            entry.TargetArtifactId,
            entry.Score
        );
    }
#endif
    Assert.AreEqual(4, actual.Count);

    // Expected links, in the order the utility should return them.
    TLLinksList expected = new TLLinksList();
    expected.Add(new TLSingleLink("A", "B*", 10));
    expected.Add(new TLSingleLink("A", "E", 9));
    expected.Add(new TLSingleLink("A", "F", 8));
    expected.Add(new TLSingleLink("A", "C*", 7));

    int position = 0;
    while (position < expected.Count)
    {
        Assert.AreEqual(expected[position], actual[position]);
        position++;
    }
}
/// <summary>
/// Scores every document of <paramref name="m1"/> against every document of
/// <paramref name="m2"/> using cosine similarity: the dot product of the two
/// document vectors divided by the product of their lengths. When that product
/// is zero the score is recorded as 0.0.
/// </summary>
/// <param name="m1">Binary document matrix</param>
/// <param name="m2">tf-idf weighted document matrix</param>
/// <returns>Similarity matrix holding one link per (m1 document, m2 document) pair</returns>
public static TLSimilarityMatrix ComputeCosine(TermDocumentMatrix m1, TermDocumentMatrix m2)
{
    // Vectors are read from the equalized matrices; names come from the inputs.
    List<TermDocumentMatrix> equalized = TermDocumentMatrix.Equalize(m1, m2);
    TermDocumentMatrix left = equalized[0];
    TermDocumentMatrix right = equalized[1];
    TLSimilarityMatrix result = new TLSimilarityMatrix();

    for (int row = 0; row < m1.NumDocs; row++)
    {
        TLLinksList rowLinks = new TLLinksList();
        for (int col = 0; col < m2.NumDocs; col++)
        {
            double lengthProduct = ComputeLength(left.GetDocument(row)) * ComputeLength(right.GetDocument(col));
            double score = 0.0;
            if (lengthProduct != 0.0)
            {
                score = ComputeDotProduct(left.GetDocument(row), right.GetDocument(col)) / lengthProduct;
            }
            rowLinks.Add(new TLSingleLink(m1.GetDocumentName(row), m2.GetDocumentName(col), score));
        }

        // Links of one source document are sorted before being appended to the result.
        rowLinks.Sort();
        foreach (TLSingleLink ranked in rowLinks)
        {
            result.AddLink(ranked.SourceArtifactId, ranked.TargetArtifactId, ranked.Score);
        }
    }

    return result;
}
/// <summary>
/// Computes the cosine similarity between each document of <paramref name="m1"/>
/// and each document of <paramref name="m2"/>.
/// Cosine similarity is (dot product) / (length * length); a zero length product
/// produces a score of 0.0 instead of a division.
/// </summary>
/// <param name="m1">Binary document matrix</param>
/// <param name="m2">tf-idf weighted document matrix</param>
/// <returns>Similarity matrix</returns>
public static TLSimilarityMatrix ComputeCosine(TermDocumentMatrix m1, TermDocumentMatrix m2)
{
    TLSimilarityMatrix similarities = new TLSimilarityMatrix();
    List<TermDocumentMatrix> aligned = TermDocumentMatrix.Equalize(m1, m2);
    TermDocumentMatrix sourceVectors = aligned[0];
    TermDocumentMatrix targetVectors = aligned[1];

    for (int s = 0; s < m1.NumDocs; s++)
    {
        TLLinksList perSource = new TLLinksList();

        for (int t = 0; t < m2.NumDocs; t++)
        {
            double denominator = ComputeLength(sourceVectors.GetDocument(s)) * ComputeLength(targetVectors.GetDocument(t));

            // Guard against dividing by zero when either vector has zero length.
            double cosine = denominator == 0.0
                ? 0.0
                : ComputeDotProduct(sourceVectors.GetDocument(s), targetVectors.GetDocument(t)) / denominator;

            perSource.Add(new TLSingleLink(m1.GetDocumentName(s), m2.GetDocumentName(t), cosine));
        }

        perSource.Sort();
        foreach (TLSingleLink item in perSource)
        {
            similarities.AddLink(item.SourceArtifactId, item.TargetArtifactId, item.Score);
        }
    }

    return similarities;
}
public void GetLinksAtRecall100()
{
    // Asks for the links needed to reach a recall level of 1.0 against the
    // oracle and checks both the number of links and their order.
    TLLinksList actual = TLSimilarityMatrixUtil.GetLinksAtRecall(sims, oracle, 1.0);
#if Verbose
    Console.WriteLine("TLSimilarityMatrixUtilTest.GetLinksAtRecall100()");
    foreach (TLSingleLink entry in actual)
    {
        Console.WriteLine("{0}\t{1}\t{2}",
            entry.SourceArtifactId,
            entry.TargetArtifactId,
            entry.Score
        );
    }
#endif
    Assert.AreEqual(9, actual.Count);

    // Expected links, in the order the utility should return them.
    TLLinksList expected = new TLLinksList();
    expected.Add(new TLSingleLink("A", "B*", 10));
    expected.Add(new TLSingleLink("A", "E", 9));
    expected.Add(new TLSingleLink("A", "F", 8));
    expected.Add(new TLSingleLink("A", "C*", 7));
    expected.Add(new TLSingleLink("A", "G", 6));
    expected.Add(new TLSingleLink("A", "H", 5));
    expected.Add(new TLSingleLink("A", "I", 4));
    expected.Add(new TLSingleLink("A", "J", 3));
    expected.Add(new TLSingleLink("A", "D*", 2));

    int position = 0;
    while (position < expected.Count)
    {
        Assert.AreEqual(expected[position], actual[position]);
        position++;
    }
}
/// <summary>
/// Returns links for the desired recall level.
/// The candidate links are sorted and taken from the front of the sorted list
/// until the number of taken links that are above threshold in
/// <paramref name="answerMatrix"/> reaches <c>answerMatrix.Count * level</c>,
/// or the candidate list is exhausted.
/// </summary>
/// <param name="matrix">Candidate matrix</param>
/// <param name="answerMatrix">Answer matrix</param>
/// <param name="level">Desired recall level, greater than 0 and at most 1</param>
/// <returns>List of links at desired recall</returns>
/// <exception cref="DevelopmentKitException">
/// Thrown when <paramref name="level"/> is not in the range (0, 1].
/// </exception>
public static TLLinksList GetLinksAtRecall(TLSimilarityMatrix matrix, TLSimilarityMatrix answerMatrix, double level)
{
    if (level <= 0.0 || level > 1.0)
    {
        throw new DevelopmentKitException("Recall level must be between 0 and 1.");
    }

    double totalCorrect = answerMatrix.Count * level;
    int numCorrect = 0;

    TLLinksList links = matrix.AllLinks;
    links.Sort();

    TLLinksList newLinks = new TLLinksList();

    // Walk the sorted list by index rather than repeatedly removing the head:
    // removing element 0 shifts every remaining element, making the scan quadratic.
    // NOTE(review): assumes no caller relies on the list returned by AllLinks
    // being drained as a side effect - confirm against TLSimilarityMatrix.AllLinks.
    for (int i = 0; i < links.Count && numCorrect < totalCorrect; i++)
    {
        TLSingleLink link = links[i];
        if (answerMatrix.IsLinkAboveThreshold(link.SourceArtifactId, link.TargetArtifactId))
        {
            numCorrect++;
        }
        newLinks.Add(link);
    }

    return newLinks;
}
public void GetLinksAboveThresholdDefault()
{
    // Sets the matrix's own threshold to 4, retrieves the links above it via
    // the overload that takes no explicit threshold, sorts them and checks
    // the count and order.
    sims.Threshold = 4;
    TLLinksList actual = TLSimilarityMatrixUtil.GetLinksAboveThreshold(sims);
    actual.Sort();
#if Verbose
    Console.WriteLine("TLSimilarityMatrixUtilTest.GetLinksAboveThresholdDefault()");
    foreach (TLSingleLink entry in actual)
    {
        Console.WriteLine("{0}\t{1}\t{2}",
            entry.SourceArtifactId,
            entry.TargetArtifactId,
            entry.Score
        );
    }
#endif
    Assert.AreEqual(6, actual.Count);

    // Expected links, in sorted order.
    TLLinksList expected = new TLLinksList();
    expected.Add(new TLSingleLink("A", "B*", 10));
    expected.Add(new TLSingleLink("A", "E", 9));
    expected.Add(new TLSingleLink("A", "F", 8));
    expected.Add(new TLSingleLink("A", "C*", 7));
    expected.Add(new TLSingleLink("A", "G", 6));
    expected.Add(new TLSingleLink("A", "H", 5));

    int position = 0;
    while (position < expected.Count)
    {
        Assert.AreEqual(expected[position], actual[position]);
        position++;
    }
}
/// <summary>
/// Collects every link of the matrix whose score is strictly greater than
/// the supplied threshold.
/// </summary>
/// <param name="matrix">Matrix</param>
/// <param name="threshold">Score threshold (exclusive)</param>
/// <returns>List of links above threshold, in the order AllLinks yields them</returns>
public static TLLinksList GetLinksAboveThreshold(TLSimilarityMatrix matrix, double threshold)
{
    TLLinksList selected = new TLLinksList();

    foreach (TLSingleLink candidate in matrix.AllLinks)
    {
        // Keep the comparison as "score > threshold" so that a NaN score is never selected.
        bool isAbove = candidate.Score > threshold;
        if (!isAbove)
        {
            continue;
        }

        selected.Add(candidate);
    }

    return selected;
}
/// <summary>
/// Computes cosine similarities between a set of boolean document vectors and
/// a tf-idf weighted corpus. For each (ids document, tfidf document) pair the
/// dot product and both squared lengths are accumulated over all terms of the
/// equalized matrices; a zero length product yields a score of 0.0.
/// </summary>
/// <param name="ids">Boolean document vectors</param>
/// <param name="tfidf">tf-idf weighted document vectors</param>
/// <returns>Similarity matrix</returns>
private static TLSimilarityMatrix ComputeSimilarities(TermDocumentMatrix ids, TermDocumentMatrix tfidf)
{
    List<TermDocumentMatrix> equalized = TermDocumentMatrix.Equalize(ids, tfidf);
    TermDocumentMatrix left = equalized[0];
    TermDocumentMatrix right = equalized[1];
    TLSimilarityMatrix result = new TLSimilarityMatrix();

    for (int row = 0; row < ids.NumDocs; row++)
    {
        TLLinksList rowLinks = new TLLinksList();

        for (int col = 0; col < tfidf.NumDocs; col++)
        {
            double dot = 0.0;
            double leftSquares = 0.0;
            double rightSquares = 0.0;

            for (int term = 0; term < left.NumTerms; term++)
            {
                double a = left[row, term];
                double b = right[col, term];
                dot += (a * b);
                leftSquares += Math.Pow(a, 2);
                rightSquares += Math.Pow(b, 2);
            }

            double norm = Math.Sqrt(leftSquares) * Math.Sqrt(rightSquares);

            // Avoid dividing by zero when either vector is all zeros.
            double score = 0.0;
            if (norm != 0.0)
            {
                score = dot / norm;
            }

            rowLinks.Add(new TLSingleLink(ids.GetDocumentName(row), tfidf.GetDocumentName(col), score));
        }

        rowLinks.Sort();
        foreach (TLSingleLink ranked in rowLinks)
        {
            result.AddLink(ranked.SourceArtifactId, ranked.TargetArtifactId, ranked.Score);
        }
    }

    return result;
}
/// <summary>
/// Produces a new similarity matrix in which scores are boosted according to
/// <paramref name="relationships"/>. For each link (source, target) of
/// <paramref name="matrix"/>, visited in sorted order, every artifact that
/// <paramref name="relationships"/> reports as related to the target has its
/// score for the same source increased by a fraction of its current value:
/// the computed delta when UseDelta is defined, otherwise 0.1.
/// </summary>
/// <param name="matrix">Matrix whose scores are to be adjusted</param>
/// <param name="relationships">Matrix queried for the artifacts related to each target</param>
/// <returns>A new matrix containing the adjusted scores</returns>
public static TLSimilarityMatrix Compute(TLSimilarityMatrix matrix, TLSimilarityMatrix relationships)
{
    // Lookup table: source id -> (target id -> score).
    Dictionary<string, Dictionary<string, double>> scoresBySource = new Dictionary<string, Dictionary<string, double>>();
    foreach (TLSingleLink original in matrix.AllLinks)
    {
        Dictionary<string, double> targets;
        if (!scoresBySource.TryGetValue(original.SourceArtifactId, out targets))
        {
            targets = new Dictionary<string, double>();
            scoresBySource.Add(original.SourceArtifactId, targets);
        }
        targets.Add(original.TargetArtifactId, original.Score);
    }

    // Fraction of the current score that is added for each related artifact.
#if UseDelta
    double boostFactor = SharedUtils.ComputeDelta(matrix);
#else
    double boostFactor = 0.1;
#endif

    // Visit every (source, target) pair in sorted order. Updates compound:
    // a score boosted earlier is the base for any later boost of the same pair.
    TLLinksList ranked = matrix.AllLinks;
    ranked.Sort();
    foreach (TLSingleLink current in ranked)
    {
        foreach (string relatedArtifact in relationships.GetSetOfTargetArtifactIdsAboveThresholdForSourceArtifact(current.TargetArtifactId))
        {
            Dictionary<string, double> sourceScores = scoresBySource[current.SourceArtifactId];
            sourceScores[relatedArtifact] += sourceScores[relatedArtifact] * boostFactor;
        }
    }

    // Flatten the lookup table back into a sorted list of links.
    TLLinksList boosted = new TLLinksList();
    foreach (KeyValuePair<string, Dictionary<string, double>> sourceEntry in scoresBySource)
    {
        foreach (KeyValuePair<string, double> targetEntry in sourceEntry.Value)
        {
            boosted.Add(new TLSingleLink(sourceEntry.Key, targetEntry.Key, targetEntry.Value));
        }
    }
    boosted.Sort();

    TLSimilarityMatrix result = new TLSimilarityMatrix();
    foreach (TLSingleLink item in boosted)
    {
        result.AddLink(item.SourceArtifactId, item.TargetArtifactId, item.Score);
    }

    return result;
}
/// <summary>
/// Builds a copy of <paramref name="matrix"/> whose scores have been raised
/// using <paramref name="relationships"/>: while walking the links of
/// <paramref name="matrix"/> in sorted order, each artifact related to a
/// link's target gets its score for that link's source increased by a
/// fraction of its current value (delta when UseDelta is defined, else 0.1).
/// </summary>
/// <param name="matrix">Matrix whose scores are to be adjusted</param>
/// <param name="relationships">Matrix queried for the artifacts related to each target</param>
/// <returns>A new matrix containing the adjusted scores</returns>
public static TLSimilarityMatrix Compute(TLSimilarityMatrix matrix, TLSimilarityMatrix relationships)
{
    // Pseudo matrix for easy lookup: sourceID -> (targetID -> score).
    Dictionary<string, Dictionary<string, double>> lookup = new Dictionary<string, Dictionary<string, double>>();
    foreach (TLSingleLink seed in matrix.AllLinks)
    {
        Dictionary<string, double> perSource;
        bool known = lookup.TryGetValue(seed.SourceArtifactId, out perSource);
        if (!known)
        {
            perSource = new Dictionary<string, double>();
            lookup.Add(seed.SourceArtifactId, perSource);
        }
        perSource.Add(seed.TargetArtifactId, seed.Score);
    }

#if UseDelta
    // Multiplier derived from the matrix itself.
    double multiplier = SharedUtils.ComputeDelta(matrix);
#else
    // Fixed multiplier used when no delta is computed.
    double multiplier = 0.1;
#endif

    // Iterate over every (source, target) pair in sorted order; each related
    // artifact's score is bumped relative to its value at that moment.
    TLLinksList ordered = matrix.AllLinks;
    ordered.Sort();
    foreach (TLSingleLink pair in ordered)
    {
        foreach (string relatedArtifact in relationships.GetSetOfTargetArtifactIdsAboveThresholdForSourceArtifact(pair.TargetArtifactId))
        {
            Dictionary<string, double> scores = lookup[pair.SourceArtifactId];
            double currentScore = scores[relatedArtifact];
            scores[relatedArtifact] = currentScore + currentScore * multiplier;
        }
    }

    // Build the new matrix from the sorted, adjusted links.
    TLLinksList adjusted = new TLLinksList();
    foreach (KeyValuePair<string, Dictionary<string, double>> bySource in lookup)
    {
        foreach (KeyValuePair<string, double> byTarget in bySource.Value)
        {
            adjusted.Add(new TLSingleLink(bySource.Key, byTarget.Key, byTarget.Value));
        }
    }
    adjusted.Sort();

    TLSimilarityMatrix output = new TLSimilarityMatrix();
    foreach (TLSingleLink entry in adjusted)
    {
        output.AddLink(entry.SourceArtifactId, entry.TargetArtifactId, entry.Score);
    }

    return output;
}
/// <summary>
/// Removes a percentage of links from the bottom of the list.
/// The input list is sorted in place, then the first
/// <c>floor(links.Count * (1 - percent))</c> links of the sorted list are
/// copied into a new list; the rest are dropped.
/// </summary>
/// <param name="links">Ranklist (sorted in place by this call)</param>
/// <param name="percent">Percentage to remove, strictly between 0 and 1</param>
/// <returns>Trimmed ranklist containing copies of the retained links</returns>
/// <exception cref="DevelopmentKitException">
/// Thrown when <paramref name="percent"/> is not strictly between 0 and 1.
/// </exception>
public static TLLinksList RemoveBottomPercentage(TLLinksList links, double percent)
{
    if (percent <= 0.0 || percent >= 1.0)
    {
        throw new DevelopmentKitException("Percentage level must be between 0 and 1.");
    }

    TLLinksList remaining = new TLLinksList();
    links.Sort();

    // Number of links to keep. The previous code subtracted 1 to obtain a
    // "last index" and then also used a strict '<' bound, which dropped one
    // link more than requested (e.g. 10 links at 20% kept 7 instead of 8).
    int keepCount = Convert.ToInt32(Math.Floor(links.Count * (1 - percent)));
    for (int i = 0; i < keepCount; i++)
    {
        TLSingleLink link = links[i];
        remaining.Add(new TLSingleLink(link.SourceArtifactId, link.TargetArtifactId, link.Score));
    }

    return remaining;
}
/// <summary>
/// Returns the links needed to reach the given recall level.
/// The links of <paramref name="sims"/> are sorted and taken from the front of
/// the sorted list until the number of taken links that are above threshold in
/// <paramref name="oracle"/> reaches <c>oracle.Count</c> multiplied by the
/// numeric value of <paramref name="level"/>, or the list is exhausted.
/// </summary>
/// <param name="sims">Candidate matrix</param>
/// <param name="oracle">Answer matrix</param>
/// <param name="level">Desired recall level</param>
/// <returns>List of links at the desired recall level</returns>
public static TLLinksList GetLinksAtRecall(TLSimilarityMatrix sims, TLSimilarityMatrix oracle, RecallLevel level)
{
    double totalCorrect = oracle.Count * RecallLevelUtil.RecallValue(level);
    int numCorrect = 0;
    TLLinksList list = new TLLinksList();

    TLLinksList links = sims.AllLinks;
    links.Sort();

    // Scan by index instead of repeatedly removing the head of the list:
    // removing element 0 shifts all remaining elements, making the loop quadratic.
    // NOTE(review): assumes no caller relies on the list returned by AllLinks
    // being drained as a side effect - confirm against TLSimilarityMatrix.AllLinks.
    for (int i = 0; i < links.Count && numCorrect < totalCorrect; i++)
    {
        TLSingleLink link = links[i];
        if (oracle.IsLinkAboveThreshold(link.SourceArtifactId, link.TargetArtifactId))
        {
            numCorrect++;
        }
        list.Add(link);
    }

    return list;
}
/// <summary>
/// Returns the links needed to reach the given recall level.
/// Candidate links are sorted and consumed from the front of the sorted list;
/// each consumed link that is above threshold in <paramref name="oracle"/>
/// counts as correct. Consumption stops once the correct count reaches
/// <c>oracle.Count</c> times the numeric value of <paramref name="level"/>,
/// or when no candidates remain.
/// </summary>
/// <param name="sims">Candidate matrix</param>
/// <param name="oracle">Answer matrix</param>
/// <param name="level">Desired recall level</param>
/// <returns>List of links at the desired recall level</returns>
public static TLLinksList GetLinksAtRecall(TLSimilarityMatrix sims, TLSimilarityMatrix oracle, RecallLevel level)
{
    double totalCorrect = oracle.Count * RecallLevelUtil.RecallValue(level);
    int numCorrect = 0;
    TLLinksList list = new TLLinksList();

    TLLinksList links = sims.AllLinks;
    links.Sort();

    // Index-based scan: the earlier RemoveAt(0) drain shifted the whole list on
    // every iteration (quadratic time) without affecting which links are returned.
    // NOTE(review): assumes no caller relies on the list returned by AllLinks
    // being drained as a side effect - confirm against TLSimilarityMatrix.AllLinks.
    int next = 0;
    while (next < links.Count && numCorrect < totalCorrect)
    {
        TLSingleLink link = links[next];
        if (oracle.IsLinkAboveThreshold(link.SourceArtifactId, link.TargetArtifactId))
        {
            numCorrect++;
        }
        list.Add(link);
        next++;
    }

    return list;
}
/// <summary>
/// Scores every source document against every target document using
/// DocumentSimilarity (documented for this method as Jensen-Shannon
/// divergence) on the equalized TermDocumentMatrices.
/// </summary>
/// <param name="source">Source artifacts collection</param>
/// <param name="target">Target artifacts collection</param>
/// <returns>Similarity matrix</returns>
public static TLSimilarityMatrix Compute(TermDocumentMatrix source, TermDocumentMatrix target)
{
    List<TermDocumentMatrix> equalized = TermDocumentMatrix.Equalize(source, target);
    TermDocumentMatrix sourceDocs = equalized[0];
    TermDocumentMatrix targetDocs = equalized[1];
    TLSimilarityMatrix result = new TLSimilarityMatrix();

    for (int s = 0; s < sourceDocs.NumDocs; s++)
    {
        TLLinksList perSource = new TLLinksList();

        for (int t = 0; t < targetDocs.NumDocs; t++)
        {
            string sourceName = sourceDocs.GetDocumentName(s);
            string targetName = targetDocs.GetDocumentName(t);
            double score = DocumentSimilarity(sourceDocs.GetDocument(s), targetDocs.GetDocument(t));
            perSource.Add(new TLSingleLink(sourceName, targetName, score));
        }

        // Links of one source document are sorted before being appended to the result.
        perSource.Sort();
        foreach (TLSingleLink ranked in perSource)
        {
            result.AddLink(ranked.SourceArtifactId, ranked.TargetArtifactId, ranked.Score);
        }
    }

    return result;
}
/// <summary>
/// Computes a similarity matrix for two TermDocumentMatrices: the matrices are
/// equalized, then each (source document, target document) pair is scored with
/// DocumentSimilarity (documented for this method as Jensen-Shannon divergence).
/// </summary>
/// <param name="source">Source artifacts collection</param>
/// <param name="target">Target artifacts collection</param>
/// <returns>Similarity matrix</returns>
public static TLSimilarityMatrix Compute(TermDocumentMatrix source, TermDocumentMatrix target)
{
    List<TermDocumentMatrix> aligned = TermDocumentMatrix.Equalize(source, target);
    TermDocumentMatrix left = aligned[0];
    TermDocumentMatrix right = aligned[1];
    TLSimilarityMatrix similarities = new TLSimilarityMatrix();

    int row = 0;
    while (row < left.NumDocs)
    {
        TLLinksList rowLinks = new TLLinksList();

        int col = 0;
        while (col < right.NumDocs)
        {
            rowLinks.Add(new TLSingleLink(
                left.GetDocumentName(row),
                right.GetDocumentName(col),
                DocumentSimilarity(left.GetDocument(row), right.GetDocument(col))));
            col++;
        }

        rowLinks.Sort();
        foreach (TLSingleLink item in rowLinks)
        {
            similarities.AddLink(item.SourceArtifactId, item.TargetArtifactId, item.Score);
        }

        row++;
    }

    return similarities;
}
/// <summary>
/// Sorts all links of the matrix and returns the first <paramref name="topN"/>
/// of them.
/// </summary>
/// <param name="matrix">Matrix</param>
/// <param name="topN">Number of links to return; must be at least 1 and no more than the number of links in the matrix</param>
/// <returns>List of top N links</returns>
/// <exception cref="DevelopmentKitException">
/// Thrown when the matrix holds fewer than <paramref name="topN"/> links, or
/// when <paramref name="topN"/> is less than 1.
/// </exception>
public static TLLinksList GetTopNLinks(TLSimilarityMatrix matrix, int topN)
{
    if (matrix.AllLinks.Count < topN)
    {
        string tooFew = "Matrix only has " + matrix.AllLinks.Count + " links (" + topN + " requested).";
        throw new DevelopmentKitException(tooFew);
    }

    if (topN < 1)
    {
        throw new DevelopmentKitException("topN must be greater than 0.");
    }

    TLLinksList ranked = matrix.AllLinks;
    ranked.Sort();

    TLLinksList top = new TLLinksList();
    int taken = 0;
    while (taken < topN)
    {
        top.Add(ranked[taken]);
        taken++;
    }

    return top;
}
/// <summary>
/// Computes cosine similarities between boolean document vectors and a tf-idf
/// weighted corpus. Each pair's score is the dot product divided by the
/// product of the two vector lengths, all accumulated term by term over the
/// equalized matrices; when the length product is zero the score is 0.0.
/// </summary>
/// <param name="ids">Boolean document vectors</param>
/// <param name="tfidf">tf-idf weighted document vectors</param>
/// <returns>Similarity matrix</returns>
private static TLSimilarityMatrix ComputeSimilarities(TermDocumentMatrix ids, TermDocumentMatrix tfidf)
{
    TLSimilarityMatrix similarities = new TLSimilarityMatrix();
    List<TermDocumentMatrix> aligned = TermDocumentMatrix.Equalize(ids, tfidf);
    TermDocumentMatrix idVectors = aligned[0];
    TermDocumentMatrix corpusVectors = aligned[1];

    for (int d = 0; d < ids.NumDocs; d++)
    {
        TLLinksList perDocument = new TLLinksList();

        for (int c = 0; c < tfidf.NumDocs; c++)
        {
            double numerator = 0.0;
            double sumSquaresA = 0.0;
            double sumSquaresB = 0.0;

            int t = 0;
            while (t < idVectors.NumTerms)
            {
                double a = idVectors[d, t];
                double b = corpusVectors[c, t];
                numerator += (a * b);
                sumSquaresA += Math.Pow(a, 2);
                sumSquaresB += Math.Pow(b, 2);
                t++;
            }

            double denominator = Math.Sqrt(sumSquaresA) * Math.Sqrt(sumSquaresB);

            // A zero denominator means at least one vector is all zeros.
            double cosine = denominator == 0.0 ? 0.0 : numerator / denominator;

            perDocument.Add(new TLSingleLink(ids.GetDocumentName(d), tfidf.GetDocumentName(c), cosine));
        }

        perDocument.Sort();
        foreach (TLSingleLink item in perDocument)
        {
            similarities.AddLink(item.SourceArtifactId, item.TargetArtifactId, item.Score);
        }
    }

    return similarities;
}
/// <summary>
/// Returns the links of the given source artifact whose score is strictly
/// greater than Threshold. The result is stored in
/// CacheOfLinksPerSourceArtifacts, and later calls for the same id return
/// the stored list.
/// </summary>
/// <param name="sourceArtifactId">Id of source artifact for which the set of relevant/retrieved links is requested</param>
/// <returns>
/// List of links from the source artifact to the targets that are retrieved or
/// relevant (depends on usage). The list is empty when the source artifact has
/// no entry in the matrix or none of its scores exceed the threshold.
/// </returns>
public TLLinksList GetLinksAboveThresholdForSourceArtifact(string sourceArtifactId)
{
    // NOTE(review): a cached list reflects the Threshold in effect when it was
    // built; confirm the cache is reset elsewhere when Threshold changes.
    TLLinksList cached;
    if (CacheOfLinksPerSourceArtifacts.TryGetValue(sourceArtifactId, out cached))
    {
        return cached;
    }

    TLLinksList selected = new TLLinksList();

    Dictionary<string, double> scoresByTarget;
    if (m_matrix.TryGetValue(sourceArtifactId, out scoresByTarget))
    {
        foreach (KeyValuePair<string, double> entry in scoresByTarget)
        {
            if (entry.Value > Threshold)
            {
                selected.Add(new TLSingleLink(sourceArtifactId, entry.Key, entry.Value));
            }
        }
    }

    // Unknown source artifacts are cached too, as an empty list.
    CacheOfLinksPerSourceArtifacts.Add(sourceArtifactId, selected);
    return selected;
}
/// <summary>
/// Removes a percentage of links from the top of the list.
/// The input list is sorted in place, then the first
/// <c>ceiling(links.Count * percent)</c> links of the sorted list are skipped
/// and the rest are copied into a new list.
/// </summary>
/// <param name="links">Ranklist (sorted in place by this call)</param>
/// <param name="percent">Percentage to remove, strictly between 0 and 1</param>
/// <returns>Trimmed ranklist containing copies of the retained links</returns>
/// <exception cref="DevelopmentKitException">
/// Thrown when <paramref name="percent"/> is not strictly between 0 and 1.
/// </exception>
public static TLLinksList RemoveTopPercentage(TLLinksList links, double percent)
{
    if (percent <= 0.0 || percent >= 1.0)
    {
        throw new DevelopmentKitException("Percentage level must be between 0 and 1.");
    }

    TLLinksList remaining = new TLLinksList();
    links.Sort();

    // Index of the first link to keep, i.e. the number of links removed.
    // The previous code subtracted 1 here, which removed one link too few
    // (none at all when the ceiling was 1) and produced index -1 for an
    // empty list, making links[-1] throw. Because percent < 1, the value
    // never exceeds links.Count.
    int startIndex = Convert.ToInt32(Math.Ceiling(links.Count * percent));
    for (int i = startIndex; i < links.Count; i++)
    {
        TLSingleLink link = links[i];
        remaining.Add(new TLSingleLink(link.SourceArtifactId, link.TargetArtifactId, link.Score));
    }

    return remaining;
}
/// <summary>
/// Returns the first <paramref name="topN"/> links of the matrix after its
/// links have been sorted.
/// </summary>
/// <param name="matrix">Matrix</param>
/// <param name="topN">Number of links to return; at least 1 and at most the number of links in the matrix</param>
/// <returns>List of top N links</returns>
/// <exception cref="DevelopmentKitException">
/// Thrown when <paramref name="topN"/> exceeds the number of links in the
/// matrix, or when <paramref name="topN"/> is less than 1.
/// </exception>
public static TLLinksList GetTopNLinks(TLSimilarityMatrix matrix, int topN)
{
    bool notEnoughLinks = matrix.AllLinks.Count < topN;
    if (notEnoughLinks)
    {
        throw new DevelopmentKitException(
            "Matrix only has " + matrix.AllLinks.Count + " links (" + topN + " requested).");
    }

    bool invalidCount = topN < 1;
    if (invalidCount)
    {
        throw new DevelopmentKitException("topN must be greater than 0.");
    }

    TLLinksList sorted = matrix.AllLinks;
    sorted.Sort();

    TLLinksList head = new TLLinksList();
    for (int index = 0; index != topN; index++)
    {
        head.Add(sorted[index]);
    }

    return head;
}
/// <summary>
/// Returns links for the desired recall level.
/// Candidate links are sorted and consumed from the front of the sorted list;
/// each consumed link that is above threshold in <paramref name="answerMatrix"/>
/// counts as correct. Consumption stops once the correct count reaches
/// <c>answerMatrix.Count * level</c>, or when no candidates remain.
/// </summary>
/// <param name="matrix">Candidate matrix</param>
/// <param name="answerMatrix">Answer matrix</param>
/// <param name="level">Desired recall level, greater than 0 and at most 1</param>
/// <returns>List of links at desired recall</returns>
/// <exception cref="DevelopmentKitException">
/// Thrown when <paramref name="level"/> is not in the range (0, 1].
/// </exception>
public static TLLinksList GetLinksAtRecall(TLSimilarityMatrix matrix, TLSimilarityMatrix answerMatrix, double level)
{
    if (level <= 0.0 || level > 1.0)
    {
        throw new DevelopmentKitException("Recall level must be between 0 and 1.");
    }

    double totalCorrect = answerMatrix.Count * level;
    int numCorrect = 0;

    TLLinksList links = matrix.AllLinks;
    links.Sort();

    TLLinksList newLinks = new TLLinksList();

    // Index-based scan: the earlier RemoveAt(0) drain shifted the whole list on
    // every iteration (quadratic time) without affecting which links are returned.
    // NOTE(review): assumes no caller relies on the list returned by AllLinks
    // being drained as a side effect - confirm against TLSimilarityMatrix.AllLinks.
    int next = 0;
    while (next < links.Count && numCorrect < totalCorrect)
    {
        TLSingleLink link = links[next];
        if (answerMatrix.IsLinkAboveThreshold(link.SourceArtifactId, link.TargetArtifactId))
        {
            numCorrect++;
        }
        newLinks.Add(link);
        next++;
    }

    return newLinks;
}
/// <summary>
/// Filters the links of the matrix, keeping those whose score is strictly
/// greater than the given threshold.
/// </summary>
/// <param name="matrix">Matrix</param>
/// <param name="threshold">Score threshold (exclusive)</param>
/// <returns>List of links above threshold, in the order AllLinks yields them</returns>
public static TLLinksList GetLinksAboveThreshold(TLSimilarityMatrix matrix, double threshold)
{
    TLLinksList kept = new TLLinksList();
    TLLinksList all = matrix.AllLinks;

    foreach (TLSingleLink item in all)
    {
        if (item.Score > threshold)
        {
            kept.Add(item);
        }
    }

    return kept;
}