public void GetLinksAboveThresholdProvided()
{
    // Ask the utility for the links of the shared 'sims' fixture using an explicit threshold of 4.
    TLLinksList actual = TLSimilarityMatrixUtil.GetLinksAboveThreshold(sims, 4);
    actual.Sort();

#if Verbose
    // Optional dump of the sorted result, one tab-separated link per line.
    Console.WriteLine("TLSimilarityMatrixUtilTest.GetLinksAboveThresholdProvided()");
    foreach (TLSingleLink dumped in actual)
    {
        Console.WriteLine("{0}\t{1}\t{2}", dumped.SourceArtifactId, dumped.TargetArtifactId, dumped.Score);
    }
#endif

    Assert.AreEqual(6, actual.Count);

    // Expected links, listed in the order the sorted result must have.
    TLLinksList expected = new TLLinksList();
    expected.Add(new TLSingleLink("A", "B*", 10));
    expected.Add(new TLSingleLink("A", "E", 9));
    expected.Add(new TLSingleLink("A", "F", 8));
    expected.Add(new TLSingleLink("A", "C*", 7));
    expected.Add(new TLSingleLink("A", "G", 6));
    expected.Add(new TLSingleLink("A", "H", 5));

    // Compare position by position.
    int position = 0;
    foreach (TLSingleLink wanted in expected)
    {
        Assert.AreEqual(wanted, actual[position]);
        position++;
    }
}
/// <summary>
/// Computes cosine similarities between two TermDocumentMatrices.
/// Cosine similarity is defined as (dot product) / (length * length).
/// A pair whose length product is exactly zero receives a score of 0.
/// </summary>
/// <param name="m1">Matrix whose documents become the source artifacts of the links</param>
/// <param name="m2">Matrix whose documents become the target artifacts of the links</param>
/// <returns>Similarity matrix holding one link per (m1 document, m2 document) pair</returns>
public static TLSimilarityMatrix ComputeCosine(TermDocumentMatrix m1, TermDocumentMatrix m2)
{
    TLSimilarityMatrix sims = new TLSimilarityMatrix();
    // NOTE(review): Equalize presumably aligns both matrices on a common term set - confirm.
    List<TermDocumentMatrix> matrices = TermDocumentMatrix.Equalize(m1, m2);
    for (int i = 0; i < m1.NumDocs; i++)
    {
        // The source document, its name and its length do not depend on j,
        // so fetch/compute them once per source instead of once per pair.
        var sourceDocument = matrices[0].GetDocument(i);
        double sourceLength = ComputeLength(sourceDocument);
        string sourceName = m1.GetDocumentName(i);

        TLLinksList links = new TLLinksList();
        for (int j = 0; j < m2.NumDocs; j++)
        {
            var targetDocument = matrices[1].GetDocument(j);
            double lengthProduct = sourceLength * ComputeLength(targetDocument);
            if (lengthProduct == 0.0)
            {
                // Avoid dividing by zero: a zero-length document is treated as dissimilar.
                links.Add(new TLSingleLink(sourceName, m2.GetDocumentName(j), 0.0));
            }
            else
            {
                links.Add(new TLSingleLink(sourceName, m2.GetDocumentName(j),
                    ComputeDotProduct(sourceDocument, targetDocument) / lengthProduct));
            }
        }
        // Links are added to the result in sorted order, one source at a time.
        links.Sort();
        foreach (TLSingleLink link in links)
        {
            sims.AddLink(link.SourceArtifactId, link.TargetArtifactId, link.Score);
        }
    }
    return sims;
}
/// <summary>
/// Computes cosine similarities between two TermDocumentMatrices.
/// Cosine similarity is defined as (dot product) / (length * length);
/// when the product of the two lengths is exactly zero the score is set to 0.
/// </summary>
/// <param name="m1">Matrix supplying the source documents (first id of every link)</param>
/// <param name="m2">Matrix supplying the target documents (second id of every link)</param>
/// <returns>Similarity matrix with a link for every pairing of an m1 document with an m2 document</returns>
public static TLSimilarityMatrix ComputeCosine(TermDocumentMatrix m1, TermDocumentMatrix m2)
{
    TLSimilarityMatrix sims = new TLSimilarityMatrix();
    List<TermDocumentMatrix> matrices = TermDocumentMatrix.Equalize(m1, m2);
    TermDocumentMatrix equalizedSource = matrices[0];
    TermDocumentMatrix equalizedTarget = matrices[1];

    for (int i = 0; i < m1.NumDocs; i++)
    {
        // Loop-invariant for the inner loop: computed once per source document.
        var rowI = equalizedSource.GetDocument(i);
        double lengthI = ComputeLength(rowI);
        string nameI = m1.GetDocumentName(i);

        TLLinksList links = new TLLinksList();
        for (int j = 0; j < m2.NumDocs; j++)
        {
            var rowJ = equalizedTarget.GetDocument(j);
            double lengthProduct = lengthI * ComputeLength(rowJ);

            // Guard against division by zero for empty (zero-length) documents.
            double score = 0.0;
            if (lengthProduct != 0.0)
            {
                score = ComputeDotProduct(rowI, rowJ) / lengthProduct;
            }
            links.Add(new TLSingleLink(nameI, m2.GetDocumentName(j), score));
        }

        // Each source's links are sorted before being appended to the result.
        links.Sort();
        foreach (TLSingleLink link in links)
        {
            sims.AddLink(link.SourceArtifactId, link.TargetArtifactId, link.Score);
        }
    }
    return sims;
}
/// <summary>
/// Builds per-source precision, recall and average-precision entries for the links
/// returned by MetricsUtil.GetLinksAtRecall, plus a "#TOTAL" mean-average-precision entry.
/// </summary>
/// <param name="sims">Candidate similarity matrix</param>
/// <param name="oracle">Answer matrix</param>
/// <param name="recall">Recall level passed to MetricsUtil.GetLinksAtRecall</param>
/// <returns>Populated DataSetPairs</returns>
public static DataSetPairs Compute(TLSimilarityMatrix sims, TLSimilarityMatrix oracle, RecallLevel recall)
{
    TLLinksList linksAtRecall = MetricsUtil.GetLinksAtRecall(sims, oracle, recall);
    TLSimilarityMatrix candidate = Similarities.CreateMatrix(linksAtRecall);
    // Lowest possible threshold so that no candidate link is filtered out below.
    candidate.Threshold = double.MinValue;

    DataSetPairs result = new DataSetPairs();
    foreach (string sourceId in oracle.SourceArtifactsIds)
    {
        TLLinksList ranked = candidate.GetLinksAboveThresholdForSourceArtifact(sourceId);
        ranked.Sort();
        int relevantTotal = oracle.GetLinksAboveThresholdForSourceArtifact(sourceId).Count;

        int hits = 0;
        int seen = 0;
        double precisionSum = 0.0;
        foreach (TLSingleLink candidateLink in ranked)
        {
            seen++;
            if (!oracle.IsLinkAboveThreshold(candidateLink.SourceArtifactId, candidateLink.TargetArtifactId))
            {
                continue;
            }
            hits++;
            // Precision at the rank of each correct link feeds the average precision.
            precisionSum += hits / (double)seen;
        }

        // Floating-point division: a zero denominator yields NaN rather than throwing.
        double precisionValue = hits / Convert.ToDouble(ranked.Count);
        double recallValue = Convert.ToDouble(hits) / relevantTotal;
        double averagePrecision = precisionSum / relevantTotal;

        result.PrecisionData.Add(new KeyValuePair<string, double>(sourceId, precisionValue));
        result.RecallData.Add(new KeyValuePair<string, double>(sourceId, recallValue));
        result.AveragePrecisionData.Add(new KeyValuePair<string, double>(sourceId, averagePrecision));
    }

    double meanAveragePrecision = DataSetPairsCollection.CalculateAverage(result.AveragePrecisionData);
    result.MeanAveragePrecisionData.Add(new KeyValuePair<string, double>("#TOTAL", meanAveragePrecision));
    return result;
}
/// <summary>
/// Returns links for the desired recall level: the sorted candidate links are taken from the
/// top until the number of correct links reaches answerMatrix.Count * level (or the list ends).
/// </summary>
/// <param name="matrix">Candidate matrix</param>
/// <param name="answerMatrix">Answer matrix</param>
/// <param name="level">Desired recall level, greater than 0 and at most 1</param>
/// <returns>List of links at desired recall</returns>
/// <exception cref="DevelopmentKitException">Thrown when level is not in (0, 1].</exception>
public static TLLinksList GetLinksAtRecall(TLSimilarityMatrix matrix, TLSimilarityMatrix answerMatrix, double level)
{
    if (level <= 0.0 || level > 1.0)
    {
        throw new DevelopmentKitException("Recall level must be between 0 and 1.");
    }

    double totalCorrect = answerMatrix.Count * level;
    int numCorrect = 0;
    TLLinksList links = matrix.AllLinks;
    links.Sort();

    TLLinksList newLinks = new TLLinksList();
    // Walk the sorted list by index instead of repeatedly removing the head element,
    // which shifts every remaining element on each iteration.
    // NOTE(review): the list obtained from AllLinks is therefore no longer emptied here;
    // presumably AllLinks hands out a fresh list per call - confirm.
    for (int i = 0; i < links.Count && numCorrect < totalCorrect; i++)
    {
        TLSingleLink link = links[i];
        if (answerMatrix.IsLinkAboveThreshold(link.SourceArtifactId, link.TargetArtifactId))
        {
            numCorrect++;
        }
        // Every visited link is kept, correct or not.
        newLinks.Add(link);
    }
    return newLinks;
}
/// <summary>
/// Re-ranks the links of <paramref name="sims"/> using feedback: links are consumed from the top of
/// the sorted list and, whenever the consumed link is confirmed by <paramref name="feedback"/>,
/// the scores of remaining links with the same source and a related target are increased.
/// Processing stops when the list is empty or feedback.Count links have been confirmed.
/// </summary>
/// <param name="sims">Candidate similarity matrix</param>
/// <param name="relationships">Matrix queried for (target, target) relatedness</param>
/// <param name="feedback">Matrix queried to confirm a consumed link</param>
/// <returns>New matrix containing the consumed links with the score they had when consumed</returns>
public static TLSimilarityMatrix Compute(TLSimilarityMatrix sims, TLSimilarityMatrix relationships, TLSimilarityMatrix feedback)
{
    TLSimilarityMatrix reranked = new TLSimilarityMatrix();
#if UseDelta
    // boost factor derived from the input matrix
    double delta = SharedUtils.ComputeDelta(sims);
#endif
    // work on the complete, sorted list of links
    TLLinksList remaining = sims.AllLinks;
    remaining.Sort();

    int confirmed = 0;
    while (remaining.Count > 0 && confirmed < feedback.Count)
    {
        TLSingleLink top = remaining[0];
        bool topIsConfirmed = feedback.IsLinkAboveThreshold(top.SourceArtifactId, top.TargetArtifactId);
        if (topIsConfirmed)
        {
            confirmed++;
            // boost every other remaining link that shares the source and whose target is related
            for (int k = 1; k < remaining.Count; k++)
            {
                TLSingleLink other = remaining[k];
                if (!top.SourceArtifactId.Equals(other.SourceArtifactId))
                {
                    continue;
                }
                if (!relationships.IsLinkAboveThreshold(top.TargetArtifactId, other.TargetArtifactId))
                {
                    continue;
                }
#if UseDelta
                other.Score += other.Score * delta;
#else
                other.Score += other.Score * 0.1;
#endif
            }
        }

        // move the consumed link into the result, then restore the ordering
        // because the boosts above may have changed it
        reranked.AddLink(top.SourceArtifactId, top.TargetArtifactId, top.Score);
        remaining.RemoveAt(0);
        remaining.Sort();
    }
    return reranked;
}
/// <summary>
/// Writes a TLSimilarityMatrix to an open writer as CSV: a header line followed by
/// one "source,target,score" line per link, in the order produced by sorting all links.
/// </summary>
/// <param name="similarityMatrix">Matrix to write</param>
/// <param name="writeFile">Open TextWriter stream (not closed by this method)</param>
private static void WriteMatrixCSV(TLSimilarityMatrix similarityMatrix, System.IO.TextWriter writeFile)
{
    // column header
    writeFile.WriteLine("Source Artifact Id,Target Artifact Id,Probability");

    TLLinksList sortedLinks = similarityMatrix.AllLinks;
    sortedLinks.Sort();

    for (int row = 0; row < sortedLinks.Count; row++)
    {
        TLSingleLink entry = sortedLinks[row];
        writeFile.WriteLine("{0},{1},{2}", entry.SourceArtifactId, entry.TargetArtifactId, entry.Score);
    }
}
/// <summary>
/// Writes a TLSimilarityMatrix to an open writer as CSV with an extra "Is correct" column:
/// 1 - the link is above threshold in the answer matrix
/// 0 - it is not
/// </summary>
/// <param name="similarityMatrix">Candidate Matrix</param>
/// <param name="answerMatrix">Answer Matrix</param>
/// <param name="writeFile">Open TextWriter stream (not closed by this method)</param>
private static void WriteMatrixCSVWithCorrectness(TLSimilarityMatrix similarityMatrix, TLSimilarityMatrix answerMatrix, System.IO.TextWriter writeFile)
{
    // column header
    writeFile.WriteLine("Source Artifact Id,Target Artifact Id,Probability,Is correct");

    TLLinksList sortedLinks = similarityMatrix.AllLinks;
    sortedLinks.Sort();

    for (int row = 0; row < sortedLinks.Count; row++)
    {
        TLSingleLink entry = sortedLinks[row];
        string correctness;
        if (answerMatrix.IsLinkAboveThreshold(entry.SourceArtifactId, entry.TargetArtifactId))
        {
            correctness = "1";
        }
        else
        {
            correctness = "0";
        }
        writeFile.WriteLine("{0},{1},{2},{3}", entry.SourceArtifactId, entry.TargetArtifactId, entry.Score, correctness);
    }
}
/// <summary>
/// Exports TLSimilarityMatrix to file in the form (each line):
/// SOURCE TARGET SCORE
/// (tab-separated, links written in sorted order).
/// </summary>
/// <param name="matrix">Similarity matrix</param>
/// <param name="filename">Output file</param>
public static void Export(TLSimilarityMatrix matrix, string filename)
{
    // 'using' guarantees the file is flushed and closed even if a write throws.
    using (TextWriter file = new StreamWriter(filename))
    {
        TLLinksList links = matrix.AllLinks;
        links.Sort();
        foreach (TLSingleLink link in links)
        {
            file.WriteLine("{0}\t{1}\t{2}", link.SourceArtifactId, link.TargetArtifactId, link.Score);
        }
    }
}
/// <summary>
/// Builds the precision-recall curve for the given tracing results: one (recall, precision)
/// point is added after each link of the sorted result list.
/// </summary>
/// <param name="tracingResults">The tracing results; when null an empty series is returned.</param>
/// <param name="dataset">The dataset supplying the answer set and the source artifacts.</param>
/// <returns>The line series (empty when there are no results or no relevant links).</returns>
public override Metric Compute(SingleTracingResults tracingResults, TLDataset dataset)
{
    LineSeries curve = new LineSeries(MetricName, MetricDescription);

    // nothing to plot without tracing results
    if (tracingResults == null)
    {
        return curve;
    }

    var resultMatrix = tracingResults.ResultMatrix;
    var answerSet = dataset.AnswerSet;
    var sourceArtifacts = dataset.SourceArtifacts;

    TLLinksList rankedLinks = resultMatrix.AllLinks;
    rankedLinks.Sort();

    // total number of relevant links over all source artifacts
    int relevantTotal = 0;
    foreach (TLArtifact source in sourceArtifacts.Values)
    {
        relevantTotal += answerSet.GetCountOfLinksAboveThresholdForSourceArtifact(source.Id);
    }

    // recall would divide by zero, so no point is added in that case
    if (relevantTotal == 0)
    {
        m_logger.Warn("Number of relevant links is 0, thus the recall value cannot be computed for the Precision Recall Curve");
        return curve;
    }

    int correctlyRetrieved = 0;
    int retrieved = 0;
    foreach (TLSingleLink candidate in rankedLinks)
    {
        retrieved++;
        if (answerSet.IsLinkAboveThreshold(candidate.SourceArtifactId, candidate.TargetArtifactId))
        {
            correctlyRetrieved++;
        }

        // 'retrieved' is at least 1 here, so the precision denominator is never zero
        double recall = (double)correctlyRetrieved / relevantTotal;
        double precision = (double)correctlyRetrieved / retrieved;
        curve.AddPoint(new Point(recall, precision));
    }

    return curve;
}
/// <summary>
/// Produces a new matrix in which, for every link (source, target) of <paramref name="matrix"/>,
/// the score of (source, related) is increased for each artifact that
/// <paramref name="relationships"/> reports as related to target.
/// </summary>
/// <param name="matrix">Candidate similarity matrix</param>
/// <param name="relationships">Matrix queried for artifacts related to a target</param>
/// <returns>New matrix with the adjusted scores, links added in sorted order</returns>
public static TLSimilarityMatrix Compute(TLSimilarityMatrix matrix, TLSimilarityMatrix relationships)
{
    // lookup table: sourceID -> (targetID -> score)
    Dictionary<string, Dictionary<string, double>> scores = new Dictionary<string, Dictionary<string, double>>();
    foreach (TLSingleLink link in matrix.AllLinks)
    {
        Dictionary<string, double> row;
        if (!scores.TryGetValue(link.SourceArtifactId, out row))
        {
            row = new Dictionary<string, double>();
            scores.Add(link.SourceArtifactId, row);
        }
        row.Add(link.TargetArtifactId, link.Score);
    }

#if UseDelta
    // boost factor derived from the input matrix
    double delta = SharedUtils.ComputeDelta(matrix);
#endif

    // visit every (source, target) pair in sorted order
    TLLinksList ordered = matrix.AllLinks;
    ordered.Sort();
    foreach (TLSingleLink link in ordered)
    {
        foreach (string related in relationships.GetSetOfTargetArtifactIdsAboveThresholdForSourceArtifact(link.TargetArtifactId))
        {
            // the indexer throws if 'related' has no entry for this source
            Dictionary<string, double> row = scores[link.SourceArtifactId];
#if UseDelta
            row[related] += row[related] * delta;
#else
            row[related] += row[related] * 0.1;
#endif
        }
    }

    // flatten the table back into a sorted list of links
    TLLinksList boosted = new TLLinksList();
    foreach (KeyValuePair<string, Dictionary<string, double>> bySource in scores)
    {
        foreach (KeyValuePair<string, double> byTarget in bySource.Value)
        {
            boosted.Add(new TLSingleLink(bySource.Key, byTarget.Key, byTarget.Value));
        }
    }
    boosted.Sort();

    TLSimilarityMatrix result = new TLSimilarityMatrix();
    foreach (TLSingleLink link in boosted)
    {
        result.AddLink(link.SourceArtifactId, link.TargetArtifactId, link.Score);
    }
    return result;
}
/// <summary>
/// Builds a new similarity matrix from <paramref name="matrix"/> where each link's related
/// targets (as reported by <paramref name="relationships"/>) have their score for the same
/// source increased once per visited link.
/// </summary>
/// <param name="matrix">Input similarity matrix</param>
/// <param name="relationships">Matrix used to look up the artifacts related to a link's target</param>
/// <returns>New matrix holding the adjusted scores</returns>
public static TLSimilarityMatrix Compute(TLSimilarityMatrix matrix, TLSimilarityMatrix relationships)
{
    // pseudo matrix for quick lookup: source id -> target id -> current score
    Dictionary<string, Dictionary<string, double>> table = new Dictionary<string, Dictionary<string, double>>();
    foreach (TLSingleLink original in matrix.AllLinks)
    {
        Dictionary<string, double> targets;
        bool known = table.TryGetValue(original.SourceArtifactId, out targets);
        if (!known)
        {
            targets = new Dictionary<string, double>();
            table.Add(original.SourceArtifactId, targets);
        }
        targets.Add(original.TargetArtifactId, original.Score);
    }

#if UseDelta
    // compute delta
    double delta = SharedUtils.ComputeDelta(matrix);
#endif

    TLLinksList sortedLinks = matrix.AllLinks;
    sortedLinks.Sort();
    for (int n = 0; n < sortedLinks.Count; n++)
    {
        TLSingleLink current = sortedLinks[n];
        // raise (current source, related artifact) for every artifact related to the current target
        foreach (string relatedId in relationships.GetSetOfTargetArtifactIdsAboveThresholdForSourceArtifact(current.TargetArtifactId))
        {
            Dictionary<string, double> targets = table[current.SourceArtifactId];
            double before = targets[relatedId];
#if UseDelta
            targets[relatedId] = before + before * delta;
#else
            targets[relatedId] = before + before * 0.1;
#endif
        }
    }

    // rebuild a sorted list from the table and copy it into the result matrix
    TLLinksList adjusted = new TLLinksList();
    foreach (string sourceId in table.Keys)
    {
        Dictionary<string, double> targets = table[sourceId];
        foreach (string targetId in targets.Keys)
        {
            adjusted.Add(new TLSingleLink(sourceId, targetId, targets[targetId]));
        }
    }
    adjusted.Sort();

    TLSimilarityMatrix output = new TLSimilarityMatrix();
    foreach (TLSingleLink item in adjusted)
    {
        output.AddLink(item.SourceArtifactId, item.TargetArtifactId, item.Score);
    }
    return output;
}
/// <summary>
/// Finds the 1-based position of a link within the sorted list of all links of a matrix.
/// Links are matched on source and target artifact ids only.
/// </summary>
/// <param name="sims">Similarity matrix to search</param>
/// <param name="link">Link to look for</param>
/// <returns>1-based position of the first match, or -1 when there is none</returns>
public static int GetLinkPos(TLSimilarityMatrix sims, TLSingleLink link)
{
    TLLinksList ranked = sims.AllLinks;
    ranked.Sort();

    for (int index = 0; index < ranked.Count; index++)
    {
        TLSingleLink candidate = ranked[index];
        bool sameSource = candidate.SourceArtifactId.Equals(link.SourceArtifactId);
        if (sameSource && candidate.TargetArtifactId.Equals(link.TargetArtifactId))
        {
            // positions are reported starting at 1
            return index + 1;
        }
    }
    return -1;
}
/// <summary>
/// For every source artifact of the oracle, records the zero-based position of each correct
/// target within that source's sorted candidate list. Results are keyed "source_target".
/// </summary>
protected override void ComputeImplementation()
{
    _oracle.Threshold = 0;
    Results = new SerializableDictionary<string, double>();

    foreach (string query in _oracle.SourceArtifactsIds)
    {
        TLLinksList ranked = _matrix.GetLinksAboveThresholdForSourceArtifact(query);
        ranked.Sort();

        int position = 0;
        foreach (TLSingleLink candidate in ranked)
        {
            if (_oracle.IsLinkAboveThreshold(query, candidate.TargetArtifactId))
            {
                Results.Add(String.Format("{0}_{1}", query, candidate.TargetArtifactId), position);
            }
            position++;
        }
    }
}
/// <summary>
/// Computes the precision-recall curve of the similarity matrix against the oracle:
/// after each link of the sorted list, one precision entry and one recall entry are stored,
/// keyed by the 1-based link number through the precision and recall format strings.
/// </summary>
protected override void ComputeImplementation()
{
    _oracle.Threshold = 0;
    int hits = 0;

    TLLinksList ranked = _matrix.AllLinks;
    ranked.Sort();

    Results = new SerializableDictionary<string, double>();
    for (int index = 0; index < ranked.Count; index++)
    {
        // keys use 1-based numbering
        int linkNumber = index + 1;
        TLSingleLink current = ranked[index];

        if (_oracle.IsLinkAboveThreshold(current.SourceArtifactId, current.TargetArtifactId))
        {
            hits++;
        }

        double precision = hits / (double)linkNumber;
        Results.Add(String.Format(_precisionFormat, linkNumber), precision);
        double recall = hits / (double)_oracle.Count;
        Results.Add(String.Format(_recallFormat, linkNumber), recall);
    }
}
/// <summary>
/// Removes a percentage of links from the bottom of the list.
/// The input list is sorted in place; the returned list contains copies of the
/// first floor(count * (1 - percent)) links.
/// </summary>
/// <param name="links">Ranklist (sorted in place by this call)</param>
/// <param name="percent">Percentage to remove, strictly between 0 and 1</param>
/// <returns>Trimmed ranklist</returns>
/// <exception cref="DevelopmentKitException">Thrown when percent is not strictly between 0 and 1.</exception>
public static TLLinksList RemoveBottomPercentage(TLLinksList links, double percent)
{
    if (percent <= 0.0 || percent >= 1.0)
    {
        throw new DevelopmentKitException("Percentage level must be between 0 and 1.");
    }

    TLLinksList remaining = new TLLinksList();
    links.Sort();

    // Number of links to keep. The previous code subtracted 1 to obtain a last index
    // but then looped with '<', so it dropped one link more than requested
    // (e.g. 10 links at 20% kept 7 instead of 8).
    int keepCount = Convert.ToInt32(Math.Floor(links.Count * (1 - percent)));
    for (int i = 0; i < keepCount; i++)
    {
        TLSingleLink link = links[i];
        remaining.Add(new TLSingleLink(link.SourceArtifactId, link.TargetArtifactId, link.Score));
    }
    return remaining;
}
/// <summary>
/// Called from MetricComputation. Stores a single "AveragePrecision" result: the sum of the
/// precision values at the rank of each correct link in the sorted list of all links,
/// divided by the number of links in the oracle.
/// </summary>
protected override void ComputeImplementation()
{
    Results = new SerializableDictionary<string, double>();

    TLLinksList ranked = _matrix.AllLinks;
    ranked.Sort();

    double precisionSum = 0.0;
    int hits = 0;
    for (int index = 0; index < ranked.Count; index++)
    {
        TLSingleLink candidate = ranked[index];
        if (!_oracle.IsLinkAboveThreshold(candidate.SourceArtifactId, candidate.TargetArtifactId))
        {
            continue;
        }
        hits++;
        // rank is 1-based: index + 1 links have been read so far
        precisionSum += hits / (double)(index + 1);
    }

    Results.Add("AveragePrecision", precisionSum / _oracle.AllLinks.Count);
}
/// <summary>
/// Imports the similarity file of the given model for a dataset and writes, for every link
/// that the oracle marks as correct, its 1-based position in the sorted list together with
/// its source, target and score (tab-separated) to a text file under "CheckLinkOrder".
/// </summary>
/// <param name="info">Supplies the output and results directories</param>
/// <param name="dataset">Dataset whose cleaned name is used in both paths</param>
/// <param name="oracle">Answer matrix used to select the links that are written</param>
/// <param name="model">Model name used in the input and output file names</param>
private static void WriteSims(ref Info info, CSMR13DataSet dataset, TLSimilarityMatrix oracle, string model)
{
    string outputPath = info.OutputDirectory + @"\CheckLinkOrder\" + SharedUtils.CleanFileName(dataset.Name) + "." + model + ".txt";

    // 'using' closes the output file even when the import or a write throws.
    using (TextWriter Output = File.CreateText(outputPath))
    {
        TLSimilarityMatrix sims = Similarities.Import(info.ResultsDirectory.FullName + @"\" + SharedUtils.CleanFileName(dataset.Name) + @"\sims\" + model + ".sims");
        TLLinksList simList = sims.AllLinks;
        simList.Sort();

        // pos counts every link, so the printed number is the rank within the full list
        int pos = 1;
        foreach (TLSingleLink link in simList)
        {
            if (oracle.IsLinkAboveThreshold(link.SourceArtifactId, link.TargetArtifactId))
            {
                Output.WriteLine("[{0}]\t{1}\t{2}\t{3}", pos, link.SourceArtifactId, link.TargetArtifactId, link.Score);
            }
            pos++;
        }
    }
}
/// <summary>
/// Returns the top of the sorted candidate list up to the requested recall level: links are
/// taken in order until the number of correct ones reaches
/// oracle.Count * RecallLevelUtil.RecallValue(level), or the list is exhausted.
/// </summary>
/// <param name="sims">Candidate similarity matrix</param>
/// <param name="oracle">Answer matrix</param>
/// <param name="level">Recall level, converted to a number by RecallLevelUtil.RecallValue</param>
/// <returns>List of the visited links (correct and incorrect) in sorted order</returns>
public static TLLinksList GetLinksAtRecall(TLSimilarityMatrix sims, TLSimilarityMatrix oracle, RecallLevel level)
{
    double totalCorrect = oracle.Count * RecallLevelUtil.RecallValue(level);
    int numCorrect = 0;
    TLLinksList list = new TLLinksList();
    TLLinksList links = sims.AllLinks;
    links.Sort();

    // Index walk instead of repeated RemoveAt(0), which shifts the whole list each time.
    // NOTE(review): the list obtained from AllLinks is no longer emptied as a side effect;
    // presumably AllLinks returns a fresh list per call - confirm.
    for (int i = 0; i < links.Count && numCorrect < totalCorrect; i++)
    {
        TLSingleLink link = links[i];
        if (oracle.IsLinkAboveThreshold(link.SourceArtifactId, link.TargetArtifactId))
        {
            numCorrect++;
        }
        list.Add(link);
    }
    return list;
}
/// <summary>
/// Computes Jensen-Shannon divergence on two TermDocumentMatrices.
/// Both matrices are first passed through TermDocumentMatrix.Equalize; every source document
/// is then scored against every target document with DocumentSimilarity.
/// </summary>
/// <param name="source">Source term-document matrix</param>
/// <param name="target">Target term-document matrix</param>
/// <returns>Similarity matrix; each source's links are added in sorted order</returns>
public static TLSimilarityMatrix Compute(TermDocumentMatrix source, TermDocumentMatrix target)
{
    List<TermDocumentMatrix> equalized = TermDocumentMatrix.Equalize(source, target);
    TermDocumentMatrix sources = equalized[0];
    TermDocumentMatrix targets = equalized[1];

    TLSimilarityMatrix result = new TLSimilarityMatrix();
    for (int s = 0; s < sources.NumDocs; s++)
    {
        TLLinksList perSource = new TLLinksList();
        for (int t = 0; t < targets.NumDocs; t++)
        {
            double score = DocumentSimilarity(sources.GetDocument(s), targets.GetDocument(t));
            perSource.Add(new TLSingleLink(sources.GetDocumentName(s), targets.GetDocumentName(t), score));
        }

        perSource.Sort();
        foreach (TLSingleLink scored in perSource)
        {
            result.AddLink(scored.SourceArtifactId, scored.TargetArtifactId, scored.Score);
        }
    }
    return result;
}
public void ComputeTest()
{
    // Compute VSM similarities for the simple corpus and compare them, link by link,
    // with the stored reference output.
    string data = @"../../Data/SimpleCorpus.";
    TLArtifactsCollection source = Artifacts.Import(data + "input.source.txt");
    TLArtifactsCollection target = Artifacts.Import(data + "input.target.txt");
    TLSimilarityMatrix testsims = VSM.Compute(source, target);
    TLSimilarityMatrix realsims = Similarities.Import(data + "output.VSM.txt");

    // Assert.AreEqual takes (expected, actual): the reference data is the expected value,
    // so that a failure message reports the two values the right way round.
    Assert.AreEqual(realsims.Count, testsims.Count);

    TLLinksList testlinks = testsims.AllLinks;
    TLLinksList reallinks = realsims.AllLinks;
    testlinks.Sort();
    reallinks.Sort();

    for (int i = 0; i < reallinks.Count; i++)
    {
        Assert.AreEqual(reallinks[i].SourceArtifactId, testlinks[i].SourceArtifactId);
        Assert.AreEqual(reallinks[i].TargetArtifactId, testlinks[i].TargetArtifactId);
        // scores are compared with an absolute tolerance
        Assert.AreEqual(reallinks[i].Score, testlinks[i].Score, 0.000000001);
    }
}
/// <summary>
/// Computes Jensen-Shannon divergence on two TermDocumentMatrices.
/// The matrices are equalized first, then each pairing of a source document with a target
/// document is scored by DocumentSimilarity.
/// </summary>
/// <param name="source">Term-document matrix of the source artifacts</param>
/// <param name="target">Term-document matrix of the target artifacts</param>
/// <returns>Similarity matrix</returns>
public static TLSimilarityMatrix Compute(TermDocumentMatrix source, TermDocumentMatrix target)
{
    List<TermDocumentMatrix> pair = TermDocumentMatrix.Equalize(source, target);
    TLSimilarityMatrix similarities = new TLSimilarityMatrix();

    int sourceIndex = 0;
    while (sourceIndex < pair[0].NumDocs)
    {
        // collect and sort the links of one source document before adding them
        TLLinksList rowLinks = new TLLinksList();
        int targetIndex = 0;
        while (targetIndex < pair[1].NumDocs)
        {
            TLSingleLink scored = new TLSingleLink(
                pair[0].GetDocumentName(sourceIndex),
                pair[1].GetDocumentName(targetIndex),
                DocumentSimilarity(pair[0].GetDocument(sourceIndex), pair[1].GetDocument(targetIndex)));
            rowLinks.Add(scored);
            targetIndex++;
        }
        rowLinks.Sort();

        for (int n = 0; n < rowLinks.Count; n++)
        {
            TLSingleLink entry = rowLinks[n];
            similarities.AddLink(entry.SourceArtifactId, entry.TargetArtifactId, entry.Score);
        }
        sourceIndex++;
    }
    return similarities;
}
/// <summary>
/// Returns the top N scoring links in a matrix (the first N links after sorting).
/// </summary>
/// <param name="matrix">Matrix</param>
/// <param name="topN">Number of links to return; at least 1 and at most the number of links in the matrix</param>
/// <returns>List of top N links</returns>
/// <exception cref="DevelopmentKitException">
/// Thrown when the matrix has fewer than topN links or when topN is less than 1.
/// </exception>
public static TLLinksList GetTopNLinks(TLSimilarityMatrix matrix, int topN)
{
    // Read AllLinks once: the same list serves validation, the error message and the result,
    // instead of evaluating the property up to four times.
    TLLinksList links = matrix.AllLinks;
    if (links.Count < topN)
    {
        throw new DevelopmentKitException("Matrix only has " + links.Count + " links (" + topN + " requested).");
    }
    if (topN < 1)
    {
        throw new DevelopmentKitException("topN must be greater than 0.");
    }

    links.Sort();
    TLLinksList newLinks = new TLLinksList();
    for (int i = 0; i < topN; i++)
    {
        newLinks.Add(links[i]);
    }
    return newLinks;
}
/// <summary>
/// Computes cosine similarities between every document of <paramref name="ids"/> and every
/// document of <paramref name="tfidf"/>, after equalizing both matrices.
/// A pair whose norm product is exactly zero gets a score of 0.
/// </summary>
/// <param name="ids">Boolean document vectors</param>
/// <param name="tfidf">tf-idf weighted document vectors</param>
/// <returns>Similarity matrix; each source's links are added in sorted order</returns>
private static TLSimilarityMatrix ComputeSimilarities(TermDocumentMatrix ids, TermDocumentMatrix tfidf)
{
    TLSimilarityMatrix result = new TLSimilarityMatrix();
    List<TermDocumentMatrix> equalized = TermDocumentMatrix.Equalize(ids, tfidf);

    for (int row = 0; row < ids.NumDocs; row++)
    {
        TLLinksList rowLinks = new TLLinksList();
        for (int col = 0; col < tfidf.NumDocs; col++)
        {
            double dot = 0.0;
            double sumSquaresA = 0.0;
            double sumSquaresB = 0.0;
            for (int term = 0; term < equalized[0].NumTerms; term++)
            {
                double a = equalized[0][row, term];
                double b = equalized[1][col, term];
                dot += (a * b);
                sumSquaresA += Math.Pow(a, 2);
                sumSquaresB += Math.Pow(b, 2);
            }

            // product of the two vector norms; zero means at least one vector is all zeros
            double normProduct = Math.Sqrt(sumSquaresA) * Math.Sqrt(sumSquaresB);
            double score = (normProduct == 0.0) ? 0.0 : dot / normProduct;
            rowLinks.Add(new TLSingleLink(ids.GetDocumentName(row), tfidf.GetDocumentName(col), score));
        }

        rowLinks.Sort();
        foreach (TLSingleLink scored in rowLinks)
        {
            result.AddLink(scored.SourceArtifactId, scored.TargetArtifactId, scored.Score);
        }
    }
    return result;
}
/// <summary>
/// Computes recall per source artifact: the number of retrieved links (above m_threshold)
/// that are correct, divided by the number of relevant links for that source.
/// Sources without relevant links get no entry.
/// </summary>
/// <param name="resultMatrix">Result matrix; its Threshold is set to m_threshold during the
/// computation and to 0.0 before returning</param>
/// <param name="dataset">Dataset supplying the answer set and the source artifacts</param>
/// <returns>Recall values keyed by source artifact id</returns>
public SortedDictionary<string, double> Calculate(TLSimilarityMatrix resultMatrix, TLDataset dataset)
{
    var answerSet = dataset.AnswerSet;
    var sourceArtifacts = dataset.SourceArtifacts;
    SortedDictionary<string, double> recallBySource = new SortedDictionary<string, double>();

    resultMatrix.Threshold = m_threshold;
    foreach (TLArtifact source in sourceArtifacts.Values)
    {
        int relevant = answerSet.GetCountOfLinksAboveThresholdForSourceArtifact(source.Id);
        // recall is undefined without relevant links, so such sources are skipped
        if (relevant <= 0)
        {
            continue;
        }

        TLLinksList retrieved = resultMatrix.GetLinksAboveThresholdForSourceArtifact(source.Id);
        retrieved.Sort();

        int correctlyRetrieved = 0;
        for (int n = 0; n < retrieved.Count; n++)
        {
            TLSingleLink candidate = retrieved[n];
            if (answerSet.IsLinkAboveThreshold(candidate.SourceArtifactId, candidate.TargetArtifactId))
            {
                correctlyRetrieved++;
            }
        }

        recallBySource.Add(source.Id, (double)correctlyRetrieved / relevant);
    }
    // note: the threshold is set to 0.0, not to the value it had on entry
    resultMatrix.Threshold = 0.0;
    return recallBySource;
}
/// <summary>
/// Called from MetricComputation. For every source artifact of the oracle, stores the sum of
/// the precision values at the rank of each correct link in that source's sorted candidate
/// list, divided by the oracle's link count for that source.
/// </summary>
protected override void ComputeImplementation()
{
    Results = new SerializableDictionary<string, double>();

    foreach (string sourceID in _oracle.SourceArtifactsIds)
    {
        TLLinksList ranked = _matrix.GetLinksAboveThresholdForSourceArtifact(sourceID);
        ranked.Sort();

        double precisionSum = 0.0;
        int hits = 0;
        for (int index = 0; index < ranked.Count; index++)
        {
            if (!_oracle.IsLinkAboveThreshold(sourceID, ranked[index].TargetArtifactId))
            {
                continue;
            }
            hits++;
            // index + 1 links have been read when this correct link is reached
            precisionSum += hits / (double)(index + 1);
        }

        double averagePrecision = precisionSum / _oracle.GetCountOfLinksAboveThresholdForSourceArtifact(sourceID);
        Results.Add(sourceID, averagePrecision);
    }
}
public void BooleanQueriesAndTFIDFCorpusTest()
{
    // Compute VSM similarities with boolean queries against a tf-idf corpus and compare them,
    // link by link, with the stored reference output.
    string inputData = Settings.Default.SimpleCorpusDir;
    string outputData = Path.Combine(inputData, "VSM");
    TLArtifactsCollection source = Artifacts.ImportFile(Path.Combine(inputData, "source.txt"));
    TLArtifactsCollection target = Artifacts.ImportFile(Path.Combine(inputData, "target.txt"));
    TLSimilarityMatrix testsims = VSM.Compute(source, target, VSMWeightEnum.BooleanQueriesAndTFIDFCorpus);
    TLSimilarityMatrix realsims = Similarities.Import(Path.Combine(outputData, "output.txt"));

    // Assert.AreEqual takes (expected, actual): the reference data is the expected value,
    // so that a failure message reports the two values the right way round.
    Assert.AreEqual(realsims.Count, testsims.Count);

    TLLinksList testlinks = testsims.AllLinks;
    TLLinksList reallinks = realsims.AllLinks;
    testlinks.Sort();
    reallinks.Sort();

    for (int i = 0; i < reallinks.Count; i++)
    {
        Assert.AreEqual(reallinks[i].SourceArtifactId, testlinks[i].SourceArtifactId);
        Assert.AreEqual(reallinks[i].TargetArtifactId, testlinks[i].TargetArtifactId);
        // scores are compared with the configured tolerance
        Assert.AreEqual(reallinks[i].Score, testlinks[i].Score, Settings.Default.DoublePrecision);
    }
}
/// <summary>
/// For every query, looks up each gold link in the query's sorted candidate list via FindLink
/// and writes "target TAB key" lines: all gold links go to <paramref name="allPath"/>, only the
/// first gold link of each query goes to <paramref name="bestPath"/>. Keys other than -1 are
/// additionally written, one per line, to allPath + ".csv".
/// </summary>
/// <param name="queries">Queries; their keys are used as source artifact ids</param>
/// <param name="sims">Candidate similarity matrix</param>
/// <param name="gold">Gold-set matrix</param>
/// <param name="allPath">Output file for all gold links (also the prefix of the ".csv" file)</param>
/// <param name="bestPath">Output file for the first gold link of each query</param>
public static void Export(TLArtifactsCollection queries, TLSimilarityMatrix sims, TLSimilarityMatrix gold, String allPath, String bestPath)
{
    // 'using' closes all three files even if a lookup or a write throws part-way through.
    using (TextWriter all = new StreamWriter(allPath, false))
    using (TextWriter best = new StreamWriter(bestPath, false))
    using (TextWriter raw = new StreamWriter(allPath + ".csv", false))
    {
        List<int> rawList = new List<int>();
        foreach (String feature in queries.Keys)
        {
            TLLinksList simList = sims.GetLinksAboveThresholdForSourceArtifact(feature);
            TLLinksList goldList = gold.GetLinksAboveThresholdForSourceArtifact(feature);
            simList.Sort();

            // each query starts with a header line in both files
            all.WriteLine(feature);
            best.WriteLine(feature);

            bool first = true;
            foreach (TLSingleLink link in goldList)
            {
                // NOTE(review): Key is presumably the position of the link in simList,
                // with -1 meaning "not found" - confirm against FindLink.
                KeyValuePair<int, TLSingleLink> recovered = FindLink(simList, link);
                if (first)
                {
                    best.WriteLine(recovered.Value.TargetArtifactId + "\t" + recovered.Key);
                    first = false;
                }
                all.WriteLine(recovered.Value.TargetArtifactId + "\t" + recovered.Key);
                if (recovered.Key != -1)
                {
                    rawList.Add(recovered.Key);
                }
            }
        }
        raw.WriteLine(String.Join("\n", rawList));
    }
}
/// <summary>
/// Sorts the result list in place and returns the sum of the precision values taken at the
/// rank of every correct link. The sum is not divided by the number of relevant links.
/// </summary>
/// <param name="sourceArtifactId">Source artifact id (not used by the computation)</param>
/// <param name="resultList">Result links; sorted in place</param>
/// <param name="answerMatrix">Answer matrix used to decide whether a link is correct</param>
/// <returns>Sum of precision-at-rank over the correct links</returns>
private static double Calculate(string sourceArtifactId, TLLinksList resultList, TLSimilarityMatrix answerMatrix)
{
    resultList.Sort();

    int hits = 0;
    double precisionSum = 0.0;
    for (int index = 0; index < resultList.Count; index++)
    {
        TLSingleLink candidate = resultList[index];
        if (!answerMatrix.IsLinkAboveThreshold(candidate.SourceArtifactId, candidate.TargetArtifactId))
        {
            continue;
        }
        hits++;
        // index + 1 documents have been read when this relevant link is reached
        int documentsRead = index + 1;
        precisionSum += (double)hits / documentsRead;
    }
    return precisionSum;
}
/// <summary>
/// Accumulates precision-at-rank over the relevant links of a result list.
/// The list is sorted in place first; the returned total is left unnormalized
/// (it is not divided by the number of relevant links).
/// </summary>
/// <param name="sourceArtifactId">Source artifact id (currently unused)</param>
/// <param name="resultList">Links to evaluate; sorted in place</param>
/// <param name="answerMatrix">Answer matrix consulted for relevance</param>
/// <returns>Total of the precision values at each relevant link</returns>
private static double Calculate(string sourceArtifactId, TLLinksList resultList, TLSimilarityMatrix answerMatrix)
{
    resultList.Sort();

    double runningTotal = 0.0;
    int relevantSeen = 0;
    int rank = 0;
    foreach (TLSingleLink entry in resultList)
    {
        rank++;
        bool relevant = answerMatrix.IsLinkAboveThreshold(entry.SourceArtifactId, entry.TargetArtifactId);
        if (relevant)
        {
            relevantSeen++;
            runningTotal += (double)relevantSeen / rank;
        }
    }
    return runningTotal;
}
/// <summary>
/// Computes, per source artifact, the precision at the point where all correct links have
/// been retrieved from the sorted result list.
/// </summary>
/// <param name="resultMatrix">Result matrix; its Threshold is set to 0.0 for every source</param>
/// <param name="dataset">Dataset supplying the answer set, source artifacts and target artifacts</param>
/// <returns>Precision values keyed by source artifact id</returns>
public SortedDictionary<string, double> Calculate(TLSimilarityMatrix resultMatrix, TLDataset dataset)
{
    var answerSet = dataset.AnswerSet;
    var sourceArtifacts = dataset.SourceArtifacts;
    SortedDictionary<string, double> metricValues = new SortedDictionary<string, double>();

    foreach (TLArtifact source in sourceArtifacts.Values)
    {
        int expectedCorrect = answerSet.GetCountOfLinksAboveThresholdForSourceArtifact(source.Id);

        resultMatrix.Threshold = 0.0;
        TLLinksList ranked = resultMatrix.GetLinksAboveThresholdForSourceArtifact(source.Id);
        ranked.Sort();

        int correct = 0;
        int retrieved = 0;
        double scoreOfLastCorrect = 0;
        bool allCorrectFound = false;

        foreach (TLSingleLink candidate in ranked)
        {
            // every visited link counts as retrieved, including the one that ends the scan
            retrieved++;

            if (allCorrectFound)
            {
                // all correct links were found: keep reading only while the answer-set score
                // equals the one recorded for the last correct link
                double score = answerSet.GetScoreForLink(candidate.SourceArtifactId, candidate.TargetArtifactId);
                if (!score.Equals(scoreOfLastCorrect))
                {
                    break;
                }
                continue;
            }

            if (!answerSet.IsLinkAboveThreshold(candidate.SourceArtifactId, candidate.TargetArtifactId))
            {
                continue;
            }

            correct++;
            if (correct == expectedCorrect)
            {
                allCorrectFound = true;
                scoreOfLastCorrect = answerSet.GetScoreForLink(candidate.SourceArtifactId, candidate.TargetArtifactId);
            }
        }

        if (correct != expectedCorrect)
        {
            // The result list was exhausted before all correct links were seen, so some correct
            // links are not in it. Precision is then taken over all target documents, because
            // every remaining document would have to be retrieved as well.
            double precision = (double)expectedCorrect / dataset.TargetArtifacts.Count;
            metricValues.Add(source.Id, precision);
        }
        else if (retrieved > 0)
        {
            double precision = (double)correct / retrieved;
            metricValues.Add(source.Id, precision);
        }
    }
    return metricValues;
}
/// <summary>
/// Writes the rank positions of the gold-set links of every query to eight text files named
/// dir + prefix + ".{all|bugs|features|patch}.{allmeasures|bestmeasures}".
/// For each query the query id is written on its own line, followed by
/// "target-id TAB position" lines: one per gold-set link in the *.allmeasures files and a
/// single best (lowest position) line in the *.bestmeasures files.
/// </summary>
/// <param name="sims">Ranked similarities. Its Threshold is set to Double.MinValue by this method.</param>
/// <param name="goldset">Gold-set links; queried with its current threshold.</param>
/// <param name="qmap">Maps a query id to its feature-set type string.</param>
/// <param name="dir">Output location; concatenated directly with <paramref name="prefix"/> (no separator is inserted).</param>
/// <param name="prefix">File name prefix.</param>
public static void Export(ref TLSimilarityMatrix sims, ref TLSimilarityMatrix goldset, Dictionary<int, string> qmap, string dir, string prefix)
{
    // 'using' guarantees every writer is flushed and closed even when an exception is thrown
    // while opening a later file or while processing the queries.
    using (TextWriter allall = new StreamWriter(dir + prefix + ".all.allmeasures", false))
    using (TextWriter allbest = new StreamWriter(dir + prefix + ".all.bestmeasures", false))
    using (TextWriter bugall = new StreamWriter(dir + prefix + ".bugs.allmeasures", false))
    using (TextWriter bugbest = new StreamWriter(dir + prefix + ".bugs.bestmeasures", false))
    using (TextWriter featall = new StreamWriter(dir + prefix + ".features.allmeasures", false))
    using (TextWriter featbest = new StreamWriter(dir + prefix + ".features.bestmeasures", false))
    using (TextWriter patchall = new StreamWriter(dir + prefix + ".patch.allmeasures", false))
    using (TextWriter patchbest = new StreamWriter(dir + prefix + ".patch.bestmeasures", false))
    {
        sims.Threshold = Double.MinValue;

        foreach (KeyValuePair<int, string> qmapKVP in qmap)
        {
            string queryId = qmapKVP.Key.ToString();
            TLLinksList simList = sims.GetLinksAboveThresholdForSourceArtifact(queryId);
            TLLinksList goldList = goldset.GetLinksAboveThresholdForSourceArtifact(queryId);
            simList.Sort();

            // Pick the feature-set specific writers once per query; both stay null when the
            // query type matches none of the known feature sets.
            // NOTE(review): assumes Trace.GetFeatureSetType is a pure lookup - confirm.
            TextWriter typeAll = null;
            TextWriter typeBest = null;
            if (qmapKVP.Value == Trace.GetFeatureSetType(FeatureSet.Bugs))
            {
                typeAll = bugall;
                typeBest = bugbest;
            }
            else if (qmapKVP.Value == Trace.GetFeatureSetType(FeatureSet.Features))
            {
                typeAll = featall;
                typeBest = featbest;
            }
            else if (qmapKVP.Value == Trace.GetFeatureSetType(FeatureSet.Patch))
            {
                typeAll = patchall;
                typeBest = patchbest;
            }

            // Header line: the query id.
            allall.WriteLine(queryId);
            allbest.WriteLine(queryId);
            if (typeAll != null)
            {
                typeAll.WriteLine(queryId);
                typeBest.WriteLine(queryId);
            }

            // Placeholder that is written when no gold-set link is recovered.
            KeyValuePair<int, TLSingleLink> best = new KeyValuePair<int, TLSingleLink>(Int32.MaxValue, new TLSingleLink("null", "null", 0));

            foreach (TLSingleLink link in goldList)
            {
                KeyValuePair<int, TLSingleLink> recovered = FindLink(simList, link);

                // A key of -1 is excluded from the best-position tracking.
                if (recovered.Key != -1 && recovered.Key < best.Key)
                {
                    best = recovered;
                }

                string recoveredLine = recovered.Value.TargetArtifactId + "\t" + recovered.Key;
                allall.WriteLine(recoveredLine);
                if (typeAll != null)
                {
                    typeAll.WriteLine(recoveredLine);
                }
            }

            string bestLine = best.Value.TargetArtifactId + "\t" + best.Key;
            allbest.WriteLine(bestLine);
            if (typeBest != null)
            {
                typeBest.WriteLine(bestLine);
            }
        }
    }
}
/// <summary>
/// Collects the rank positions of gold-set links in the ranked similarities of every query
/// and summarises them, overall and per feature-set type, as two effectiveness metrics:
/// one over all recovered links and one over the first recovered link of each query.
/// </summary>
/// <param name="sims">Ranked similarities. Its Threshold is set to Double.MinValue by this method.</param>
/// <param name="goldset">Gold-set links used to decide whether a ranked link counts as recovered.</param>
/// <param name="qmap">Maps a query id to its feature-set type string.</param>
/// <param name="ModelName">Name used as prefix of the metric descriptions.</param>
/// <returns>Dataset results holding the "all" and "best" effectiveness metrics.</returns>
public static DatasetResults Calculate(ref TLSimilarityMatrix sims, ref TLSimilarityMatrix goldset, Dictionary<int, string> qmap, string ModelName)
{
    // Every recovered link, overall and per feature-set type.
    TLKeyValuePairsList everyHit = new TLKeyValuePairsList();
    TLKeyValuePairsList everyBugHit = new TLKeyValuePairsList();
    TLKeyValuePairsList everyFeatureHit = new TLKeyValuePairsList();
    TLKeyValuePairsList everyPatchHit = new TLKeyValuePairsList();

    // Only the first recovered link of each query, overall and per feature-set type.
    TLKeyValuePairsList firstHit = new TLKeyValuePairsList();
    TLKeyValuePairsList firstBugHit = new TLKeyValuePairsList();
    TLKeyValuePairsList firstFeatureHit = new TLKeyValuePairsList();
    TLKeyValuePairsList firstPatchHit = new TLKeyValuePairsList();

    sims.Threshold = Double.MinValue;

    foreach (KeyValuePair<int, string> query in qmap)
    {
        TLLinksList ranked = sims.GetLinksAboveThresholdForSourceArtifact(query.Key.ToString());
        ranked.Sort();

        bool firstHitRecorded = false;
        for (int position = 0; position < ranked.Count; position++)
        {
            TLSingleLink candidate = ranked[position];
            if (!goldset.IsLinkAboveThreshold(candidate.SourceArtifactId, candidate.TargetArtifactId))
            {
                continue;
            }

            // Key is "source_target", value is the zero-based position in the sorted list.
            KeyValuePair<string, double> hit = new KeyValuePair<string, double>(candidate.SourceArtifactId + "_" + candidate.TargetArtifactId, position);
            everyHit.Add(hit);

            if (!firstHitRecorded)
            {
                firstHitRecorded = true;
                firstHit.Add(hit);

                if (query.Value == Trace.GetFeatureSetType(FeatureSet.Bugs))
                {
                    firstBugHit.Add(hit);
                }
                else if (query.Value == Trace.GetFeatureSetType(FeatureSet.Features))
                {
                    firstFeatureHit.Add(hit);
                }
                else if (query.Value == Trace.GetFeatureSetType(FeatureSet.Patch))
                {
                    firstPatchHit.Add(hit);
                }
            }

            if (query.Value == Trace.GetFeatureSetType(FeatureSet.Bugs))
            {
                everyBugHit.Add(hit);
            }
            else if (query.Value == Trace.GetFeatureSetType(FeatureSet.Features))
            {
                everyFeatureHit.Add(hit);
            }
            else if (query.Value == Trace.GetFeatureSetType(FeatureSet.Patch))
            {
                everyPatchHit.Add(hit);
            }
        }
    }

    List<SummaryData> allSummaries = new List<SummaryData>
    {
        CreateSummaryData(everyHit, "All (all)"),
        CreateSummaryData(everyBugHit, "Bugs (all)"),
        CreateSummaryData(everyFeatureHit, "Features (all)"),
        CreateSummaryData(everyPatchHit, "Patches (all)")
    };

    List<SummaryData> bestSummaries = new List<SummaryData>
    {
        CreateSummaryData(firstHit, "All (best)"),
        CreateSummaryData(firstBugHit, "Bugs (best)"),
        CreateSummaryData(firstFeatureHit, "Features (best)"),
        CreateSummaryData(firstPatchHit, "Patches (best)")
    };

    List<Metric> metrics = new List<Metric>
    {
        new EffectivenessMetric(allSummaries, 0.0, "none", ModelName + " all"),
        new EffectivenessMetric(bestSummaries, 0.0, "none", ModelName + " best")
    };

    return new DatasetResults("", metrics);
}
/// <summary>
/// Computes cosine similarities between a set of boolean document vectors and a tfidf weighted corpus.
/// Both matrices are first passed through TermDocumentMatrix.Equalize; the similarity of a pair is
/// (dot product) / (norm * norm), or 0.0 when that norm product is zero.
/// </summary>
/// <param name="ids">Boolean document vectors</param>
/// <param name="tfidf">tf-idf weighted document vectors</param>
/// <returns>Similarity matrix with one link per (ids document, tfidf document) pair</returns>
private static TLSimilarityMatrix ComputeSimilarities(TermDocumentMatrix ids, TermDocumentMatrix tfidf)
{
    TLSimilarityMatrix sims = new TLSimilarityMatrix();
    List<TermDocumentMatrix> matrices = TermDocumentMatrix.Equalize(ids, tfidf);

    if (ids.NumDocs == 0)
    {
        // No source documents: nothing to compare.
        return sims;
    }

    TermDocumentMatrix sourceMatrix = matrices[0];
    TermDocumentMatrix targetMatrix = matrices[1];
    int numTerms = sourceMatrix.NumTerms;

    // The norm of a target vector does not depend on the source document, so compute each
    // one once instead of once per (source, target) pair. The summation order is the same
    // as before, so the resulting scores are unchanged.
    double[] targetNorms = new double[tfidf.NumDocs];
    for (int j = 0; j < tfidf.NumDocs; j++)
    {
        double bsquared = 0.0;
        for (int k = 0; k < numTerms; k++)
        {
            double b = targetMatrix[j, k];
            bsquared += Math.Pow(b, 2);
        }
        targetNorms[j] = Math.Sqrt(bsquared);
    }

    for (int i = 0; i < ids.NumDocs; i++)
    {
        // Likewise, the norm of the source vector is invariant over the target loop.
        double asquared = 0.0;
        for (int k = 0; k < numTerms; k++)
        {
            double a = sourceMatrix[i, k];
            asquared += Math.Pow(a, 2);
        }
        double sourceNorm = Math.Sqrt(asquared);

        TLLinksList links = new TLLinksList();
        for (int j = 0; j < tfidf.NumDocs; j++)
        {
            double product = 0.0;
            for (int k = 0; k < numTerms; k++)
            {
                double a = sourceMatrix[i, k];
                double b = targetMatrix[j, k];
                product += (a * b);
            }

            double cross = sourceNorm * targetNorms[j];

            // A zero norm product means at least one vector is all zeros; report 0.0
            // rather than dividing by zero.
            double score = (cross == 0.0) ? 0.0 : product / cross;
            links.Add(new TLSingleLink(ids.GetDocumentName(i), tfidf.GetDocumentName(j), score));
        }

        // Links of one source document are added to the matrix in sorted order.
        links.Sort();
        foreach (TLSingleLink link in links)
        {
            sims.AddLink(link.SourceArtifactId, link.TargetArtifactId, link.Score);
        }
    }

    return sims;
}
/// <summary>
/// Returns a copy of the ranklist without its leading (top) portion.
/// The input list is sorted in place; the copy starts at index
/// ceil(links.Count * percent) - 1 of the sorted list and runs to its end.
/// </summary>
/// <remarks>
/// NOTE(review): because of the "- 1", ceil(links.Count * percent) - 1 links are dropped
/// (e.g. none for 10 links at 0.1) - confirm this is the intended rounding.
/// </remarks>
/// <param name="links">Ranklist (sorted as a side effect)</param>
/// <param name="percent">Fraction to remove, strictly between 0 and 1</param>
/// <returns>New ranklist holding copies of the retained links</returns>
/// <exception cref="DevelopmentKitException">percent is less than or equal to 0, or greater than or equal to 1</exception>
public static TLLinksList RemoveTopPercentage(TLLinksList links, double percent)
{
    if (percent <= 0.0 || percent >= 1.0)
    {
        throw new DevelopmentKitException("Percentage level must be between 0 and 1.");
    }

    links.Sort();

    double cutoff = Math.Ceiling(links.Count * percent);
    int position = Convert.ToInt32(cutoff) - 1;

    TLLinksList kept = new TLLinksList();
    while (position < links.Count)
    {
        TLSingleLink original = links[position];
        kept.Add(new TLSingleLink(original.SourceArtifactId, original.TargetArtifactId, original.Score));
        position++;
    }

    return kept;
}
/// <summary>
/// Computes the average precision of the ranked results for every source artifact that has
/// at least one relevant link in the answer set.
/// Links sharing the same similarity score are treated as one tie group: a group containing
/// at least one relevant link contributes (relevant links retrieved so far) divided by the
/// average 1-based rank position of the group.
/// </summary>
/// <param name="resultMatrix">Similarity matrix holding the ranked results; queried with its current threshold.</param>
/// <param name="dataset">Dataset providing the answer set and the source artifacts.</param>
/// <returns>Average precision per source artifact id; artifacts without relevant links are omitted.</returns>
public SortedDictionary<string, double> Calculate(TLSimilarityMatrix resultMatrix, TLDataset dataset)
{
    var answerSet = dataset.AnswerSet;
    var sourceArtifacts = dataset.SourceArtifacts;
    SortedDictionary<string, double> metricValues = new SortedDictionary<string, double>();

    foreach (TLArtifact sourceArtifact in sourceArtifacts.Values)
    {
        int numberOfRelevant = answerSet.GetCountOfLinksAboveThresholdForSourceArtifact(sourceArtifact.Id);

        // Do the calculation only if there are relevant links.
        if (numberOfRelevant <= 0)
        {
            continue;
        }

        TLLinksList resultsListForArtifact = resultMatrix.GetLinksAboveThresholdForSourceArtifact(sourceArtifact.Id);
        resultsListForArtifact.Sort();

        int numRetrieved = 0;
        int numCorrectlyRetrieved = 0;
        double sumPrecision = 0;

        // State of the current tie group (links with identical score).
        int numSameRankPosition = 1;
        int sumSameRankPosition = 0;
        bool hasCorrectlyRetrieved = false;

        // -1 acts as the "no previous link" marker.
        // NOTE(review): presumably no real score equals -1 - confirm.
        double lastSimilarityScore = -1;

        foreach (TLSingleLink link in resultsListForArtifact)
        {
            numRetrieved++;

            if (link.Score != lastSimilarityScore)
            {
                // The score changed: close the previous tie group and start a new one.
                if (hasCorrectlyRetrieved)
                {
                    double averageRankPosition = (double)sumSameRankPosition / numSameRankPosition;
                    sumPrecision += (double)numCorrectlyRetrieved / averageRankPosition;
                }

                numSameRankPosition = 1;
                sumSameRankPosition = numRetrieved;
                hasCorrectlyRetrieved = false;
            }
            else
            {
                numSameRankPosition++;
                sumSameRankPosition += numRetrieved;
            }

            if (answerSet.IsLinkAboveThreshold(link.SourceArtifactId, link.TargetArtifactId))
            {
                numCorrectlyRetrieved++;
                hasCorrectlyRetrieved = true;
            }

            lastSimilarityScore = link.Score;
        }

        // Close the last tie group. The cast is required: dividing the two ints would
        // truncate the average rank position (e.g. 3 / 2 == 1 instead of 1.5).
        if (hasCorrectlyRetrieved)
        {
            double averageRankPosition = (double)sumSameRankPosition / numSameRankPosition;
            sumPrecision += (double)numCorrectlyRetrieved / averageRankPosition;
        }

        double averagePrecision = sumPrecision / numberOfRelevant;
        metricValues.Add(sourceArtifact.Id, averagePrecision);
    }

    return metricValues;
}