// Round-trips a similarity matrix through WriteData/ReadData over one shared
// in-memory stream, then checks that the link count and every source artifact
// id of the written matrix are present in the matrix that was read back.
public void SimilarityMatrixRawSerializationTest()
{
    string[] sourceIds = new string[]
    {
        "source1", "source2", "source3", "source4", "source5",
        "source6", "source7", "source8", "source9", "source10"
    };
    string[] targetIds = new string[]
    {
        "target1", "target2", "target3", "target4", "target5",
        "target6", "target7", "target8", "target9", "target10"
    };

    // Link k pairs sourceIds[k] with targetIds[k] and uses k as its score.
    TLSimilarityMatrix written = new TLSimilarityMatrix();
    int position = 0;
    while (position < sourceIds.Length)
    {
        written.AddLink(sourceIds[position], targetIds[position], (double)position);
        position++;
    }

    // Writer and reader share the same underlying stream.
    BinaryWriter writer = new BinaryWriter(new MemoryStream());
    BinaryReader reader = new BinaryReader(writer.BaseStream);

    written.WriteData(writer);

    // Rewind before reading back what was just written.
    reader.BaseStream.Position = 0;

    TLSimilarityMatrix restored = new TLSimilarityMatrix();
    restored.ReadData(reader);

    Assert.AreEqual(written.Count, restored.Count);

    StringHashSet expectedSources = written.SourceArtifactsIds;
    StringHashSet actualSources = restored.SourceArtifactsIds;
    foreach (string sourceId in expectedSources)
    {
        Assert.IsTrue(actualSources.Contains(sourceId));
    }
}
/// <summary>
/// Computes the cosine similarity of every document in <paramref name="m1"/> against
/// every document in <paramref name="m2"/>:
/// (dot product) / (length of first * length of second).
/// A pair whose length product is exactly zero receives a score of 0.0.
/// </summary>
/// <param name="m1">Binary document matrix</param>
/// <param name="m2">tf-idf weighted document matrix</param>
/// <returns>Similarity matrix; the links of each source document are sorted before being added</returns>
public static TLSimilarityMatrix ComputeCosine(TermDocumentMatrix m1, TermDocumentMatrix m2)
{
    TLSimilarityMatrix result = new TLSimilarityMatrix();
    // Equalize returns the two matrices in the same order they were passed in.
    List<TermDocumentMatrix> equalized = TermDocumentMatrix.Equalize(m1, m2);

    for (int row = 0; row < m1.NumDocs; row++)
    {
        TLLinksList rowLinks = new TLLinksList();

        for (int col = 0; col < m2.NumDocs; col++)
        {
            double denominator = ComputeLength(equalized[0].GetDocument(row)) * ComputeLength(equalized[1].GetDocument(col));

            // Exact comparison on purpose: only a true zero denominator is special-cased.
            double score = (denominator == 0.0)
                ? 0.0
                : ComputeDotProduct(equalized[0].GetDocument(row), equalized[1].GetDocument(col)) / denominator;

            rowLinks.Add(new TLSingleLink(m1.GetDocumentName(row), m2.GetDocumentName(col), score));
        }

        // Sort this source's links first, then copy them into the result in that order.
        rowLinks.Sort();
        foreach (TLSingleLink sorted in rowLinks)
        {
            result.AddLink(sorted.SourceArtifactId, sorted.TargetArtifactId, sorted.Score);
        }
    }

    return result;
}
/// <summary>
/// Imports a file in the form (each line):
/// SOURCE TARGET SCORE
/// Blank and whitespace-only lines are skipped.
/// </summary>
/// <param name="filename">Similarities file</param>
/// <returns>Similarity matrix</returns>
/// <exception cref="InvalidDataException">
/// A non-blank line has fewer than three whitespace-separated fields.
/// </exception>
public static TLSimilarityMatrix Import(String filename)
{
    TLSimilarityMatrix answer = new TLSimilarityMatrix();

    // 'using' closes the reader on every exit path, including exceptions that are
    // not caught below (for example a FormatException from Convert.ToDouble).
    using (StreamReader file = new StreamReader(filename))
    {
        String line;
        int num = 0; // 1-based line number, used only in the error message
        while ((line = file.ReadLine()) != null)
        {
            num++;
            if (String.IsNullOrWhiteSpace(line))
            {
                continue;
            }

            try
            {
                String[] artifacts = line.Split();
                String source = artifacts[0];
                String target = artifacts[1];
                double score = Convert.ToDouble(artifacts[2]);
                answer.AddLink(source, target, score);
            }
            catch (IndexOutOfRangeException e)
            {
                throw new InvalidDataException("Invalid data format on line " + num + " of file:" + filename, e);
            }
        }
    }

    return answer;
}
/// <summary>
/// Builds a similarity matrix from a list of links by adding each link's
/// source id, target id and score in the list's iteration order.
/// </summary>
/// <param name="list">Links to copy</param>
/// <returns>New similarity matrix containing the links</returns>
public static TLSimilarityMatrix CreateMatrix(TLLinksList list)
{
    TLSimilarityMatrix result = new TLSimilarityMatrix();

    foreach (TLSingleLink entry in list)
    {
        string sourceId = entry.SourceArtifactId;
        string targetId = entry.TargetArtifactId;
        result.AddLink(sourceId, targetId, entry.Score);
    }

    return result;
}
/// <summary>
/// Normalizes a similarity matrix: every score is replaced by
/// (score - average similarity) / similarity standard deviation.
/// </summary>
/// <param name="matrix">Similarity matrix</param>
/// <returns>New matrix holding the normalized scores; the input is not modified here</returns>
public static TLSimilarityMatrix Normalize(TLSimilarityMatrix matrix)
{
    double average = TLSimilarityMatrixUtil.AverageSimilarity(matrix);
    double deviation = TLSimilarityMatrixUtil.SimilarityStandardDeviation(matrix);

    TLSimilarityMatrix normalized = new TLSimilarityMatrix();
    foreach (TLSingleLink current in matrix.AllLinks)
    {
        // NOTE(review): there is no guard for a zero deviation; the division is performed as-is.
        double centered = current.Score - average;
        normalized.AddLink(current.SourceArtifactId, current.TargetArtifactId, centered / deviation);
    }

    return normalized;
}
/// <summary>
/// Replaces <paramref name="sims"/> with a new matrix holding only the links whose source
/// artifact id (parsed as an integer) maps, through <paramref name="qmap"/>, to the
/// feature-type string of <paramref name="set"/>.
/// </summary>
/// <param name="sims">Matrix to filter; reassigned to the filtered matrix</param>
/// <param name="set">Feature set to keep</param>
/// <param name="qmap">Lookup from numeric source id to feature-type string</param>
private static void RemoveNonFeature(ref TLSimilarityMatrix sims, FeatureSet set, Dictionary<int, string> qmap)
{
    TLSimilarityMatrix kept = new TLSimilarityMatrix();
    string wantedType = GetFeatureSetType(set);

    foreach (TLSingleLink candidate in sims.AllLinks)
    {
        int queryId = Convert.ToInt32(candidate.SourceArtifactId);
        if (qmap[queryId] != wantedType)
        {
            continue;
        }

        kept.AddLink(candidate.SourceArtifactId, candidate.TargetArtifactId, candidate.Score);
    }

    sims = kept;
}
/// <summary>
/// Distributes the links of <paramref name="original"/> over three matrices according to the
/// feature type that <paramref name="qmap"/> assigns to each link's source artifact id
/// (parsed as an integer). Links whose type matches none of the three are dropped.
/// </summary>
/// <param name="original">Matrix whose links are split</param>
/// <param name="qmap">Lookup from numeric source id to feature-type string</param>
/// <param name="bugs">Receives links whose source maps to the Bugs type</param>
/// <param name="features">Receives links whose source maps to the Features type</param>
/// <param name="patch">Receives links whose source maps to the Patch type</param>
public static void Split(ref TLSimilarityMatrix original, Dictionary<int, string> qmap, ref TLSimilarityMatrix bugs, ref TLSimilarityMatrix features, ref TLSimilarityMatrix patch)
{
    // The three type strings do not depend on the link, so look them up once
    // instead of up to three times per link.
    string bugsType = Trace.GetFeatureSetType(FeatureSet.Bugs);
    string featuresType = Trace.GetFeatureSetType(FeatureSet.Features);
    string patchType = Trace.GetFeatureSetType(FeatureSet.Patch);

    foreach (TLSingleLink link in original.AllLinks)
    {
        string feature = qmap[Convert.ToInt32(link.SourceArtifactId)];

        if (feature == bugsType)
        {
            bugs.AddLink(link.SourceArtifactId, link.TargetArtifactId, link.Score);
        }
        else if (feature == featuresType)
        {
            features.AddLink(link.SourceArtifactId, link.TargetArtifactId, link.Score);
        }
        else if (feature == patchType)
        {
            patch.AddLink(link.SourceArtifactId, link.TargetArtifactId, link.Score);
        }
    }
}
/// <summary>
/// Builds an oracle matrix in which the fixed source id "trace" is linked to every entry of
/// the map file, scored with the value at the same position in the rank file.
/// </summary>
/// <param name="rankFile">File read through Generics.ImportDoubles</param>
/// <param name="mapFile">File read through Generics.ImportStrings</param>
/// <returns>Oracle similarity matrix</returns>
internal static TLSimilarityMatrix GenerateOracle(string rankFile, string mapFile)
{
    Console.WriteLine("Generating oracle...");

    // Materialize both sequences once. Calling Count()/ElementAt(i) on a plain
    // IEnumerable inside the loop re-enumerates it each time (quadratic work, and a
    // deferred sequence could even be re-read from its source).
    List<double> ranks = Generics.ImportDoubles(rankFile, false).ToList();
    List<string> map = Generics.ImportStrings(mapFile).ToList();

    Assert.AreEqual(map.Count, ranks.Count);

    TLSimilarityMatrix oracle = new TLSimilarityMatrix();
    for (int i = 0; i < map.Count; i++)
    {
        oracle.AddLink("trace", map[i], ranks[i]);
    }

    return oracle;
}
/// <summary>
/// Combines two similarity matrices: both are normalized, then every link of the normalized
/// large expert is combined (via Combine) with the score the normalized small expert
/// reports for the same source/target pair.
/// </summary>
/// <param name="large">Large expert</param>
/// <param name="small">Small expert</param>
/// <param name="lambda">Weight given to large expert</param>
/// <returns>Transformed similarities, one link per link of the large expert</returns>
public static TLSimilarityMatrix Transform(TLSimilarityMatrix large, TLSimilarityMatrix small, double lambda)
{
    TLSimilarityMatrix normalizedLarge = Normalize(large);
    TLSimilarityMatrix normalizedSmall = Normalize(small);

    TLSimilarityMatrix result = new TLSimilarityMatrix();
    foreach (TLSingleLink link in normalizedLarge.AllLinks)
    {
        string sourceId = link.SourceArtifactId;
        string targetId = link.TargetArtifactId;

        double smallScore = normalizedSmall.GetScoreForLink(sourceId, targetId);
        result.AddLink(sourceId, targetId, Combine(link.Score, smallScore, lambda));
    }

    return result;
}
/// <summary>
/// FORMAT
/// ======
/// Line 1  - "","UC","CC","Similarity","Oracle","Precision","Recall","feedback"
/// Line 2+ - values
/// The header line is discarded. Each following line is split on commas and double quotes
/// (empty entries removed); fields 1, 2 and 3 become source id, target id and score.
/// Blank lines are skipped.
/// </summary>
/// <param name="path">Path of the CSV file to read</param>
/// <returns>Similarity matrix whose Threshold is set to Double.MinValue</returns>
public static TLSimilarityMatrix Import(string path)
{
    TLSimilarityMatrix matrix = new TLSimilarityMatrix();
    matrix.Threshold = Double.MinValue;

    // The reader was previously never closed; 'using' releases the file handle on every path.
    using (TextReader file = new StreamReader(path))
    {
        // Discard the header row.
        file.ReadLine();

        string line;
        while ((line = file.ReadLine()) != null)
        {
            // A blank (e.g. trailing) line has no fields and would otherwise fail on item[1].
            if (line.Trim().Length == 0)
            {
                continue;
            }

            string[] item = line.Split(new char[] { ',', '"' }, StringSplitOptions.RemoveEmptyEntries);
            // item[0] is presumably the row label from the unnamed first column - TODO confirm.
            matrix.AddLink(item[1], item[2], Convert.ToDouble(item[3]));
        }
    }

    return matrix;
}
/// <summary>
/// For every link (source, target) of <paramref name="matrix"/>, taken in sorted order,
/// increases the score of (source, related) by a fraction of its current value for each
/// artifact that <paramref name="relationships"/> reports as related to target. The fraction
/// is the computed delta when UseDelta is defined, otherwise 0.1.
/// </summary>
/// <param name="matrix">Similarities to adjust</param>
/// <param name="relationships">Relationships between target artifacts</param>
/// <returns>New matrix with the adjusted scores, added in sorted order</returns>
public static TLSimilarityMatrix Compute(TLSimilarityMatrix matrix, TLSimilarityMatrix relationships)
{
    // Lookup table: scores[sourceId][targetId] = current score.
    Dictionary<string, Dictionary<string, double>> scores = new Dictionary<string, Dictionary<string, double>>();
    foreach (TLSingleLink entry in matrix.AllLinks)
    {
        Dictionary<string, double> row;
        if (!scores.TryGetValue(entry.SourceArtifactId, out row))
        {
            row = new Dictionary<string, double>();
            scores.Add(entry.SourceArtifactId, row);
        }
        row.Add(entry.TargetArtifactId, entry.Score);
    }

#if UseDelta
    // Fraction derived from the input similarities.
    double boost = SharedUtils.ComputeDelta(matrix);
#else
    // Fixed fraction.
    double boost = 0.1;
#endif

    // Walk every (source, target) pair in sorted order.
    TLLinksList ordered = matrix.AllLinks;
    ordered.Sort();
    foreach (TLSingleLink entry in ordered)
    {
        Dictionary<string, double> row = scores[entry.SourceArtifactId];

        // Every artifact related to this link's target has its score, for the same source, increased.
        foreach (string related in relationships.GetSetOfTargetArtifactIdsAboveThresholdForSourceArtifact(entry.TargetArtifactId))
        {
            row[related] += row[related] * boost;
        }
    }

    // Flatten the table back into a list, sort it, and build the resulting matrix.
    TLLinksList adjusted = new TLLinksList();
    foreach (KeyValuePair<string, Dictionary<string, double>> bySource in scores)
    {
        foreach (KeyValuePair<string, double> byTarget in bySource.Value)
        {
            adjusted.Add(new TLSingleLink(bySource.Key, byTarget.Key, byTarget.Value));
        }
    }
    adjusted.Sort();

    TLSimilarityMatrix result = new TLSimilarityMatrix();
    foreach (TLSingleLink entry in adjusted)
    {
        result.AddLink(entry.SourceArtifactId, entry.TargetArtifactId, entry.Score);
    }

    return result;
}
/// <summary>
/// Imports every file of a directory as one source artifact: the feature id is extracted
/// from the file path and each line of the file becomes a target linked with score 1.
/// </summary>
/// <param name="directory">Directory whose files are read</param>
/// <returns>Similarity matrix with one link per line read</returns>
public static TLSimilarityMatrix Import(string directory)
{
    TLSimilarityMatrix matrix = new TLSimilarityMatrix();

    foreach (String file in Directory.GetFiles(directory))
    {
        String feature = Similarities.ExtractFeatureID(file);

        // 'using' closes the reader even if reading or AddLink throws; the explicit
        // Close() at the end of the loop body was skipped in that case.
        using (StreamReader links = new StreamReader(file))
        {
            String link;
            while ((link = links.ReadLine()) != null)
            {
                matrix.AddLink(feature, link, 1);
            }
        }
    }

    return matrix;
}
/// <summary>
/// Repeatedly takes the first link of the sorted list. If <paramref name="feedback"/> marks it
/// as above threshold, every remaining link with the same source whose target is related
/// (per <paramref name="relationships"/>) to the taken link's target has its score increased
/// by a fraction of itself (the computed delta when UseDelta is defined, otherwise 0.1).
/// The taken link is then moved to the result and the remaining list is re-sorted.
/// The loop ends when the list is empty or as many links have been confirmed as
/// <paramref name="feedback"/> contains; links still in the list at that point are not added.
/// </summary>
/// <param name="sims">Similarities to process</param>
/// <param name="relationships">Relationships between target artifacts</param>
/// <param name="feedback">Feedback matrix</param>
/// <returns>Matrix of the links that were taken, in the order they were taken</returns>
public static TLSimilarityMatrix Compute(TLSimilarityMatrix sims, TLSimilarityMatrix relationships, TLSimilarityMatrix feedback)
{
    TLSimilarityMatrix taken = new TLSimilarityMatrix();

#if UseDelta
    // Fraction derived from the input similarities.
    double boost = SharedUtils.ComputeDelta(sims);
#else
    // Fixed fraction.
    double boost = 0.1;
#endif

    // Start from a fully sorted list.
    TLLinksList pending = sims.AllLinks;
    pending.Sort();

    // Number of taken links that the feedback confirmed.
    int confirmed = 0;

    while (pending.Count > 0 && confirmed < feedback.Count)
    {
        TLSingleLink top = pending[0];

        if (feedback.IsLinkAboveThreshold(top.SourceArtifactId, top.TargetArtifactId))
        {
            confirmed++;

            // Increase the links that share the source and have a related target.
            for (int idx = 1; idx < pending.Count; idx++)
            {
                TLSingleLink other = pending[idx];
                if (top.SourceArtifactId.Equals(other.SourceArtifactId)
                    && relationships.IsLinkAboveThreshold(top.TargetArtifactId, other.TargetArtifactId))
                {
                    other.Score += other.Score * boost;
                }
            }
        }

        // Move the top link into the result, then restore sorted order for the rest.
        taken.AddLink(top.SourceArtifactId, top.TargetArtifactId, top.Score);
        pending.RemoveAt(0);
        pending.Sort();
    }

    return taken;
}
/// <summary>
/// Imports every file of a directory as one source artifact. The feature id is extracted
/// from the file path; each line is split on single spaces, field 0 is a 1-based index
/// into <paramref name="map"/> giving the target id, and field 2 is the score.
/// </summary>
/// <param name="directory">Directory whose files are read</param>
/// <param name="map">Target ids, addressed by (field 0 - 1)</param>
/// <returns>Similarity matrix with one link per line read</returns>
public static TLSimilarityMatrix Import(String directory, List<String> map)
{
    TLSimilarityMatrix sims = new TLSimilarityMatrix();

    foreach (String file in Directory.GetFiles(directory))
    {
        String feature = ExtractFeatureID(file);

        // 'using' closes the reader even when a line fails to parse or index;
        // the explicit Close() was skipped in that case.
        using (StreamReader idFile = new StreamReader(file))
        {
            String line;
            while ((line = idFile.ReadLine()) != null)
            {
                String[] vars = line.Split(' ');
                sims.AddLink(feature, map[Convert.ToInt32(vars[0]) - 1], Convert.ToDouble(vars[2]));
            }
        }
    }

    return sims;
}
/// <summary>
/// Imports every file of a directory as one source artifact.
/// Each file is a link: the feature id is extracted from the file path
/// (filename.map - feature) and each line in the file is a methodID,
/// linked to the feature with score 1.
/// </summary>
/// <param name="directory">Directory whose files are read</param>
/// <returns>Similarity matrix with one link per line read</returns>
public static TLSimilarityMatrix Import(String directory)
{
    TLSimilarityMatrix matrix = new TLSimilarityMatrix();

    foreach (String file in Directory.GetFiles(directory))
    {
        String feature = ExtractFeatureID(file);

        // The reader was never closed before, leaking one file handle per file.
        using (StreamReader links = new StreamReader(file))
        {
            String link;
            while ((link = links.ReadLine()) != null)
            {
                matrix.AddLink(feature, link, 1);
            }
        }
    }

    return matrix;
}
/// <summary>
/// Scores every source document against every target document with DocumentSimilarity,
/// after equalizing the two term-document matrices (Jensen-Shannon divergence based).
/// </summary>
/// <param name="source">Source artifacts collection</param>
/// <param name="target">Target artifacts collection</param>
/// <returns>Similarity matrix; the links of each source document are sorted before being added</returns>
public static TLSimilarityMatrix Compute(TermDocumentMatrix source, TermDocumentMatrix target)
{
    List<TermDocumentMatrix> equalized = TermDocumentMatrix.Equalize(source, target);
    TermDocumentMatrix left = equalized[0];
    TermDocumentMatrix right = equalized[1];

    TLSimilarityMatrix result = new TLSimilarityMatrix();

    for (int row = 0; row < left.NumDocs; row++)
    {
        TLLinksList rowLinks = new TLLinksList();

        for (int col = 0; col < right.NumDocs; col++)
        {
            TLSingleLink scored = new TLSingleLink(
                left.GetDocumentName(row),
                right.GetDocumentName(col),
                DocumentSimilarity(left.GetDocument(row), right.GetDocument(col)));
            rowLinks.Add(scored);
        }

        // Sort this source's links first, then copy them into the result in that order.
        rowLinks.Sort();
        foreach (TLSingleLink sorted in rowLinks)
        {
            result.AddLink(sorted.SourceArtifactId, sorted.TargetArtifactId, sorted.Score);
        }
    }

    return result;
}
/// <summary>
/// Imports an answer set from file in the form (each line):
/// SOURCE TARGET1 TARGET2 ...
/// Fields are separated by whitespace; empty fields are ignored.
/// </summary>
/// <param name="filename">File location</param>
/// <returns>Similarity matrix (link score 1)</returns>
public static TLSimilarityMatrix Import(String filename)
{
    TLSimilarityMatrix answer = new TLSimilarityMatrix();

    // The reader was never closed before; 'using' releases the file handle on every path.
    using (StreamReader file = new StreamReader(filename))
    {
        String line;
        while ((line = file.ReadLine()) != null)
        {
            String[] artifacts = line.Split();
            String source = artifacts[0];

            // Everything after the first field is a target of that source.
            for (int i = 1; i < artifacts.Length; i++)
            {
                String target = artifacts[i].Trim();
                if (target != "")
                {
                    answer.AddLink(source, target, 1);
                }
            }
        }
    }

    return answer;
}
/// <summary>
/// Scores every query against every document as
/// product(query, document) / (sqrt(number of query entries) * document length),
/// where the document length is looked up in <paramref name="lengths"/> by document id.
/// </summary>
/// <param name="docs">Document vectors, keyed by document id</param>
/// <param name="lengths">Per-document length, indexed by document id</param>
/// <param name="queries">Query vectors, keyed by query id</param>
/// <returns>Similarity matrix with one link per (query, document) pair</returns>
public static TLSimilarityMatrix Compute(NormalizedVectorCollection docs, NormalizedVector lengths, DocumentVectorCollection queries)
{
    TLSimilarityMatrix result = new TLSimilarityMatrix();

    foreach (KeyValuePair<string, DocumentVector> query in queries)
    {
        // Since tf in queries are all 1, the query's length term is taken to be
        // the square root of the number of entries in the query vector.
        double queryLength = Math.Sqrt(query.Value.Count);

        foreach (KeyValuePair<string, NormalizedVector> doc in docs)
        {
            double docLength = lengths[doc.Key];
            double product = ComputeProduct(query.Value, doc.Value);
            double score = product / (queryLength * docLength);
            result.AddLink(query.Key, doc.Key, score);
        }
    }

    return result;
}
/// <summary>
/// Runs one ANC search per source artifact, using the artifact's text as the query, and
/// records every returned result as a link from that source artifact to the result's artifact,
/// scored with the result's ranking.
/// </summary>
/// <param name="sourceArtifacts">The source artifacts.</param>
/// <param name="targetArtifacts">The target artifacts (only checked for null here).</param>
/// <param name="dict">The dictionary index passed to the searcher.</param>
/// <param name="ancTermsWeights">The ANC term weights, prepared for each search via PrepareANCData.</param>
/// <param name="config">The config; supplies the similarity metric.</param>
/// <returns>Similarity matrix with links between source and target artifacts</returns>
/// <exception cref="ComponentException">
/// Any of the source artifacts, target artifacts, dictionary index or term weights is null.
/// </exception>
private static TLSimilarityMatrix ComputeTraceability(TLArtifactsCollection sourceArtifacts, TLArtifactsCollection targetArtifacts, TLDictionaryIndex dict, TLKeyValuePairsList ancTermsWeights, TracerConfig config)
{
    // Reject missing inputs up front, in the same order as the parameters.
    if (sourceArtifacts == null)
        throw new ComponentException("Received source artifacts are null!");
    if (targetArtifacts == null)
        throw new ComponentException("Received target artifacts are null!");
    if (dict == null)
        throw new ComponentException("Received dictionary index is null!");
    if (ancTermsWeights == null)
        throw new ComponentException("Received 'ancTermsWeights' is null!");

    TLSimilarityMatrix links = new TLSimilarityMatrix();
    ANCSearcher ancSearcher = new ANCSearcher(SimilarityMetricFactory.GetSimiliarityMetric(config.SimilarityMetric));

    foreach (TLArtifact source in sourceArtifacts.Values)
    {
        // The artifact text is the query; the ANC data is prepared again for every search.
        List<Result> hits = ancSearcher.search(source.Text, dict, PrepareANCData(ancTermsWeights));

        foreach (Result hit in hits)
        {
            links.AddLink(source.Id, hit.ArtifactId, hit.Ranking);
        }
    }

    return links;
}
/// <summary>
/// Collapses overloaded target artifacts. A target id containing '(' at an index greater
/// than zero is cut off just before the '('; targets that become identical for the same
/// source are merged into one link that keeps the highest score.
/// </summary>
/// <param name="matrix">Similarities</param>
/// <returns>New matrix with collapsed targets</returns>
public static TLSimilarityMatrix CollapseOverloadedTargets(TLSimilarityMatrix matrix)
{
    // best[sourceId][collapsedTargetId] = highest score seen so far
    Dictionary<string, Dictionary<string, double>> best = new Dictionary<string, Dictionary<string, double>>();

    foreach (TLSingleLink link in matrix.AllLinks)
    {
        Dictionary<string, double> row;
        if (!best.TryGetValue(link.SourceArtifactId, out row))
        {
            row = new Dictionary<string, double>();
            best.Add(link.SourceArtifactId, row);
        }

        // Drop the parameter list, if there is one after at least one leading character.
        string collapsedTarget = link.TargetArtifactId;
        int parenIndex = collapsedTarget.IndexOf('(');
        if (parenIndex > 0)
        {
            collapsedTarget = collapsedTarget.Substring(0, parenIndex);
        }

        double known;
        if (!row.TryGetValue(collapsedTarget, out known))
        {
            row.Add(collapsedTarget, link.Score);
        }
        else if (link.Score > known)
        {
            row[collapsedTarget] = link.Score;
        }
    }

    TLSimilarityMatrix collapsed = new TLSimilarityMatrix();
    foreach (KeyValuePair<string, Dictionary<string, double>> bySource in best)
    {
        foreach (KeyValuePair<string, double> byTarget in bySource.Value)
        {
            collapsed.AddLink(bySource.Key, byTarget.Key, byTarget.Value);
        }
    }

    return collapsed;
}
/// <summary>
/// Extracts links containing the given artifact IDs from a similarity matrix.
/// A link is kept when its source id or its target id is among <paramref name="artifactIDs"/>.
/// </summary>
/// <param name="original">Original matrix</param>
/// <param name="artifactIDs">List of artifact IDs</param>
/// <param name="ignoreParameters">
/// Flag to ignore parameter overloads and compare only method names: an id containing '('
/// at an index greater than zero is compared by the part before the '('. Kept links still
/// carry their original, unmodified ids.
/// </param>
/// <returns>Extracted links, as returned by AllLinks of a matrix built from the kept links</returns>
/// <exception cref="ArgumentNullException"><paramref name="artifactIDs"/> is null.</exception>
public static TLLinksList ExtractLinks(TLLinksList original, IEnumerable<string> artifactIDs, bool ignoreParameters)
{
    // Enumerable.Contains on a lazy sequence re-enumerates it for every call (twice per link).
    // An existing collection is used directly, so its own Contains semantics are preserved;
    // anything else is materialized once into a set.
    ICollection<string> wanted = artifactIDs as ICollection<string>;
    if (wanted == null)
    {
        wanted = new HashSet<string>(artifactIDs);
    }

    TLSimilarityMatrix matrix = new TLSimilarityMatrix();

    foreach (TLSingleLink link in original)
    {
        string sourceID = link.SourceArtifactId;
        string targetID = link.TargetArtifactId;

        if (ignoreParameters)
        {
            // Locate '(' once per id instead of searching twice.
            int sourceParen = sourceID.IndexOf('(');
            if (sourceParen > 0)
            {
                sourceID = sourceID.Substring(0, sourceParen);
            }

            int targetParen = targetID.IndexOf('(');
            if (targetParen > 0)
            {
                targetID = targetID.Substring(0, targetParen);
            }
        }

        if (wanted.Contains(sourceID) || wanted.Contains(targetID))
        {
            matrix.AddLink(link.SourceArtifactId, link.TargetArtifactId, link.Score);
        }
    }

    return matrix.AllLinks;
}
/// <summary>
/// Import script results. The output file is read in full, its first two characters and
/// every ')' are removed, and the remainder is split on commas into scores. Score i is
/// attached to edge i of the edges file, whose two whitespace-separated fields are integer
/// indices into the corpus map (source first, then target).
/// </summary>
/// <param name="result">RScriptResults object (not used by this implementation)</param>
/// <returns>Script results as a TLSimilarityMatrix</returns>
/// <exception cref="RDataException">The number of scores differs from the number of edges.</exception>
public override object ImportResults(RScriptResult result)
{
    string rawdata;
    // 'using' closes the reader even if ReadToEnd throws.
    using (TextReader rfile = new StreamReader(_outputFile))
    {
        rawdata = rfile.ReadToEnd();
    }

    TLSimilarityMatrix matrix = new TLSimilarityMatrix();

    // Presumably the output looks like an R vector literal, e.g. "c(0.1, 0.2)" - TODO confirm.
    string[] sims = rawdata.Remove(0, 2).Replace(")", String.Empty).Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
    string[] edges = Generics.ImportStrings(_info.Edges);

    if (sims.Length != edges.Length)
    {
        throw new RDataException("Results are incorrect size: " + sims.Length + " vs " + edges.Length);
    }

    for (int i = 0; i < sims.Length; i++)
    {
        string[] split = edges[i].Split();
        matrix.AddLink(_corpus.Map[Convert.ToInt32(split[0])], _corpus.Map[Convert.ToInt32(split[1])], Convert.ToDouble(sims[i]));
    }

    return matrix;
}
/// <summary>
/// Imports the answer set without validation against source and target artifacts.
/// Every /answer_set/links/link element becomes one link; a missing confidence_score
/// defaults to 1.0.
/// </summary>
/// <param name="filepath">The filepath of the answer set XML file.</param>
/// <param name="logger">The logger (not used by this overload).</param>
/// <param name="trimValues">if set to <c>true</c>, ids and the confidence score text are trimmed before use.</param>
/// <returns>Similarity matrix holding the links of the answer set</returns>
/// <exception cref="XmlException">
/// A link lacks source_artifact_id or target_artifact_id, or its confidence_score cannot be parsed as a double.
/// </exception>
public static TLSimilarityMatrix ImportAnswerSet(string filepath, ComponentLogger logger, bool trimValues)
{
    TLSimilarityMatrix answerSet = new TLSimilarityMatrix();

    XPathDocument doc = new XPathDocument(filepath);
    XPathNavigator nav = doc.CreateNavigator();

    // Read collection info. The ids are not used by this overload; the reads are kept so
    // that a file without these nodes is still rejected (iter is null in that case).
    XPathNavigator iter = nav.SelectSingleNode("/answer_set/answer_info/source_artifacts_collection");
    string source_artifacts_collection_id = iter.Value;

    iter = nav.SelectSingleNode("/answer_set/answer_info/target_artifacts_collection");
    string target_artifacts_collection_id = iter.Value;

    XPathNodeIterator linksIterator = nav.Select("/answer_set/links/link");

    while (linksIterator.MoveNext())
    {
        // Parse Source Artifact Id
        iter = linksIterator.Current.SelectSingleNode("source_artifact_id");
        if (iter == null)
        {
            throw new XmlException(String.Format("The source_artifact_id has not been provided for the link. File location: {0}", filepath));
        }
        string source_artifact_id = iter.Value;
        if (trimValues)
        {
            source_artifact_id = source_artifact_id.Trim();
        }

        // Parse Target Artifact Id
        iter = linksIterator.Current.SelectSingleNode("target_artifact_id");
        if (iter == null)
        {
            throw new XmlException(String.Format("The target_artifact_id has not been provided for the link. File location: {0}", filepath));
        }
        string target_artifact_id = iter.Value;
        if (trimValues)
        {
            target_artifact_id = target_artifact_id.Trim();
        }

        // Parse confidence score
        double confidence_score;
        iter = linksIterator.Current.SelectSingleNode("confidence_score");
        if (iter == null)
        {
            // if confidence score is not provided set it to default value 1
            confidence_score = 1.0;
        }
        else
        {
            string tmpValue = iter.Value;
            if (trimValues)
            {
                tmpValue = tmpValue.Trim();
            }

            if (double.TryParse(tmpValue, out confidence_score) == false)
            {
                // The message used to print the target id where the offending value belongs
                // and never showed the value itself.
                throw new XmlException(String.Format("The confidence score provided for link from source artifact {0} to target artifact {1} is in incorrect format: '{2}'. File location: {3}", source_artifact_id, target_artifact_id, tmpValue, filepath));
            }
        }

        answerSet.AddLink(source_artifact_id, target_artifact_id, confidence_score);
    }

    return answerSet;
}
/// <summary>
/// Imports the answer set and validates it against the loaded artifacts. The collection ids
/// named in the file must equal the ids of the given collections. A link that references an
/// artifact id missing from its collection is logged as a warning and left out of the result.
/// A missing confidence_score defaults to 1.0.
/// </summary>
/// <param name="filepath">The filepath of the answer set XML file.</param>
/// <param name="sourceArtifacts">The source artifacts.</param>
/// <param name="sourceArtifactsFilePath">The source artifacts file path (its file name is used in warnings).</param>
/// <param name="targetArtifacts">The target artifacts.</param>
/// <param name="targetArtifactsFilePath">The target artifacts file path (its file name is used in warnings).</param>
/// <param name="logger">The logger that receives the warnings.</param>
/// <param name="trimValues">if set to <c>true</c>, ids and the confidence score text are trimmed before use.</param>
/// <returns>Similarity matrix holding the links whose source and target artifacts are both known</returns>
/// <exception cref="ArgumentException">A collection id in the file differs from the loaded collection's id.</exception>
/// <exception cref="XmlException">
/// A link lacks source_artifact_id or target_artifact_id, or its confidence_score cannot be parsed as a double.
/// </exception>
public static TLSimilarityMatrix ImportAnswerSet(string filepath, TLArtifactsCollection sourceArtifacts, string sourceArtifactsFilePath, TLArtifactsCollection targetArtifacts, string targetArtifactsFilePath, ComponentLogger logger, bool trimValues)
{
    string friendlyAnswerSetFilename = System.IO.Path.GetFileName(filepath);
    string friendlySourceArtifactsFilename = System.IO.Path.GetFileName(sourceArtifactsFilePath);
    string friendlyTargetArtifactsFilename = System.IO.Path.GetFileName(targetArtifactsFilePath);

    TLSimilarityMatrix answerSet = new TLSimilarityMatrix();

    XPathDocument doc = new XPathDocument(filepath);
    XPathNavigator nav = doc.CreateNavigator();

    // read collection info
    XPathNavigator iter = nav.SelectSingleNode("/answer_set/answer_info/source_artifacts_collection");
    string source_artifacts_collection_id = iter.Value;
    if (source_artifacts_collection_id.Equals(sourceArtifacts.CollectionId) == false)
    {
        throw new ArgumentException(String.Format("The answer set refers to source artifact collection with id '{0}', while loaded artifacts collection has different id '{1}'. Importing answer set from {2}", source_artifacts_collection_id, sourceArtifacts.CollectionId, filepath));
    }

    iter = nav.SelectSingleNode("/answer_set/answer_info/target_artifacts_collection");
    string target_artifacts_collection_id = iter.Value;
    if (target_artifacts_collection_id.Equals(targetArtifacts.CollectionId) == false)
    {
        throw new ArgumentException(String.Format("The answer set refers to target artifact collection with id '{0}', while loaded artifacts collection has different id '{1}'. Importing answer set from {2}", target_artifacts_collection_id, targetArtifacts.CollectionId, filepath));
    }

    XPathNodeIterator linksIterator = nav.Select("/answer_set/links/link");

    while (linksIterator.MoveNext())
    {
        // Parse Source Artifact Id
        iter = linksIterator.Current.SelectSingleNode("source_artifact_id");
        if (iter == null)
        {
            throw new XmlException(String.Format("The source_artifact_id has not been provided for the link. File location: {0}", filepath));
        }
        string source_artifact_id = iter.Value;
        if (trimValues)
        {
            source_artifact_id = source_artifact_id.Trim();
        }

        bool sourceKnown = sourceArtifacts.ContainsKey(source_artifact_id);
        if (!sourceKnown)
        {
            logger.Warn(String.Format("The source artifact id '{0}' referenced in the answer set {1} has not been found in the source artifacts {2}. Therefore, this link has been removed in this experiment.", source_artifact_id, friendlyAnswerSetFilename, friendlySourceArtifactsFilename));
        }

        // Parse Target Artifact Id
        iter = linksIterator.Current.SelectSingleNode("target_artifact_id");
        if (iter == null)
        {
            throw new XmlException(String.Format("The target_artifact_id has not been provided for the link. File location: {0}", filepath));
        }
        string target_artifact_id = iter.Value;
        if (trimValues)
        {
            target_artifact_id = target_artifact_id.Trim();
        }

        bool targetKnown = targetArtifacts.ContainsKey(target_artifact_id);
        if (!targetKnown)
        {
            logger.Warn(String.Format("The target artifact id '{0}' referenced in the answer set {1} has not been found in the target artifacts {2}. Therefore, this link has been removed in this experiment.", target_artifact_id, friendlyAnswerSetFilename, friendlyTargetArtifactsFilename));
        }

        // Parse confidence score. This is done even for links that will be skipped, so a
        // malformed score is still reported.
        double confidence_score;
        iter = linksIterator.Current.SelectSingleNode("confidence_score");
        if (iter == null)
        {
            // if confidence score is not provided set it to default value 1
            confidence_score = 1.0;
        }
        else
        {
            string tmpValue = iter.Value;
            if (trimValues)
            {
                tmpValue = tmpValue.Trim();
            }

            if (double.TryParse(tmpValue, out confidence_score) == false)
            {
                // The message used to print the target id where the offending value belongs
                // and never showed the value itself.
                throw new XmlException(String.Format("The confidence score provided for link from source artifact {0} to target artifact {1} is in incorrect format: '{2}'. File location: {3}", source_artifact_id, target_artifact_id, tmpValue, filepath));
            }
        }

        // The warnings above announce that the link "has been removed"; honour that by only
        // adding links whose two endpoints are both known. Previously the link was added anyway.
        if (sourceKnown && targetKnown)
        {
            answerSet.AddLink(source_artifact_id, target_artifact_id, confidence_score);
        }
    }

    return answerSet;
}
/// <summary>
/// Runs one search per source artifact, using the artifact's text as the query against the
/// dictionary index, and records every returned result as a link from that source artifact
/// to the result's artifact, scored with the result's ranking.
/// </summary>
/// <param name="sourceArtifacts">Artifacts whose text is used as queries</param>
/// <param name="dict">Dictionary index passed to the searcher</param>
/// <param name="config">Configuration; supplies the similarity metric</param>
/// <returns>Similarity matrix with one link per search result</returns>
/// <exception cref="ComponentException">The source artifacts or the dictionary index is null.</exception>
private static TLSimilarityMatrix Process(TLArtifactsCollection sourceArtifacts, TLDictionaryIndex dict, TracerConfig config)
{
    if (sourceArtifacts == null)
        throw new ComponentException("Received null sourceArtifacts");
    if (dict == null)
        throw new ComponentException("Received null dictionaryIndex");

    TLSimilarityMatrix links = new TLSimilarityMatrix();
    Searcher engine = new Searcher(SimilarityMetricFactory.GetSimiliarityMetric(config.SimilarityMetric));

    foreach (TLArtifact source in sourceArtifacts.Values)
    {
        // The artifact text is the query.
        List<Result> hits = engine.search(source.Text, dict);

        foreach (Result hit in hits)
        {
            links.AddLink(source.Id, hit.ArtifactId, hit.Ranking);
        }
    }

    return links;
}
/// <summary>
/// Computes cosine similarities between a set of boolean document vectors and a tfidf weighted corpus.
/// Both matrices are equalized first; a pair whose length product is exactly zero scores 0.0.
/// </summary>
/// <param name="ids">Boolean document vectors</param>
/// <param name="tfidf">tf-idf weighted document vectors</param>
/// <returns>Similarity matrix; the links of each source document are sorted before being added</returns>
private static TLSimilarityMatrix ComputeSimilarities(TermDocumentMatrix ids, TermDocumentMatrix tfidf)
{
    TLSimilarityMatrix sims = new TLSimilarityMatrix();
    List<TermDocumentMatrix> matrices = TermDocumentMatrix.Equalize(ids, tfidf);
    TermDocumentMatrix left = matrices[0];
    TermDocumentMatrix right = matrices[1];

    for (int i = 0; i < ids.NumDocs; i++)
    {
        // The source row's length does not depend on the target document, so it is
        // computed once per row instead of once per (row, target) pair. The terms are
        // summed in the same order as before, so the value is unchanged.
        double asquared = 0.0;
        for (int k = 0; k < left.NumTerms; k++)
        {
            asquared += Math.Pow(left[i, k], 2);
        }
        double sourceLength = Math.Sqrt(asquared);

        TLLinksList links = new TLLinksList();
        for (int j = 0; j < tfidf.NumDocs; j++)
        {
            double product = 0.0;
            double bsquared = 0.0;
            for (int k = 0; k < left.NumTerms; k++)
            {
                double a = left[i, k];
                double b = right[j, k];
                product += (a * b);
                bsquared += Math.Pow(b, 2);
            }

            double cross = sourceLength * Math.Sqrt(bsquared);

            // Exact comparison on purpose: only a true zero denominator is special-cased.
            double score = (cross == 0.0) ? 0.0 : product / cross;
            links.Add(new TLSingleLink(ids.GetDocumentName(i), tfidf.GetDocumentName(j), score));
        }

        // Sort this source's links first, then copy them into the result in that order.
        links.Sort();
        foreach (TLSingleLink link in links)
        {
            sims.AddLink(link.SourceArtifactId, link.TargetArtifactId, link.Score);
        }
    }

    return sims;
}
/// <summary>
/// Imports script results. The output file is read in full, its first two characters and
/// every ')' are removed, and the remainder is split on commas into scores. The scores are
/// consumed in order, target index varying fastest: score (s * targetCount + t) links
/// source document s to target document t.
/// </summary>
/// <param name="result">RScriptResults object (not used by this implementation)</param>
/// <returns>Script results as a TLSimilarityMatrix</returns>
/// <exception cref="RDataException">
/// The number of scores is not (source document count * target document count).
/// </exception>
public override object ImportResults(RScriptResult result)
{
    TextReader reader = new StreamReader(_outputFile);
    string contents = reader.ReadToEnd();
    reader.Close();

    TLSimilarityMatrix matrix = new TLSimilarityMatrix();

    char[] separators = new char[] { ',' };
    string stripped = contents.Remove(0, 2).Replace(")", String.Empty);
    string[] sims = stripped.Split(separators, StringSplitOptions.RemoveEmptyEntries);

    int sourceCount = _source.DocMap.Count;
    int targetCount = _target.DocMap.Count;

    if (sims.Length != sourceCount * targetCount)
    {
        throw new RDataException("Results are incorrect size: " + sims.Length + " vs " + (sourceCount * targetCount));
    }

    // One block of targetCount consecutive scores per source document.
    for (int s = 0; s < sourceCount; s++)
    {
        for (int t = 0; t < targetCount; t++)
        {
            string sim = sims[s * targetCount + t];
            matrix.AddLink(_source.DocMap[s], _target.DocMap[t], Convert.ToDouble(sim.Trim()));
        }
    }

    return matrix;
}
// Rebuilds the shared fixtures: an oracle with three links from "A" (the starred targets)
// and a similarity matrix with ten links from "A", added in a deliberately unsorted order.
public void RunBeforeEachTest()
{
    oracle = new TLSimilarityMatrix();
    foreach (string relevantTarget in new string[] { "B*", "C*", "D*" })
    {
        oracle.AddLink("A", relevantTarget, 1);
    }

    sims = new TLSimilarityMatrix();
    /* Sorted order:
     * sims.AddLink("A", "B*", 10);
     * sims.AddLink("A", "E",  9);
     * sims.AddLink("A", "F",  8);
     * sims.AddLink("A", "C*", 7);
     * sims.AddLink("A", "G",  6);
     * sims.AddLink("A", "H",  5);
     * sims.AddLink("A", "I",  4);
     * sims.AddLink("A", "J",  3);
     * sims.AddLink("A", "D*", 2);
     * sims.AddLink("A", "K",  1);
     */
    // Insertion order below is intentionally different from the sorted order above.
    string[] targets = new string[] { "G", "K", "B*", "E", "J", "F", "C*", "H", "D*", "I" };
    int[] scores = new int[] { 6, 1, 10, 9, 3, 8, 7, 5, 2, 4 };
    for (int k = 0; k < targets.Length; k++)
    {
        sims.AddLink("A", targets[k], scores[k]);
    }
}
/// <summary>
/// Imports an oracle from a directory of files.
/// Each file is a source artifact (its file name is the source id) containing targets on each line.
/// Blank and whitespace-only lines are skipped; every other line is linked with score 1.
/// </summary>
/// <param name="directory">Directory whose files are read</param>
/// <returns>Oracle similarity matrix</returns>
public static TLSimilarityMatrix ImportDirectory(string directory)
{
    TLSimilarityMatrix oracle = new TLSimilarityMatrix();

    foreach (string file in Directory.GetFiles(directory))
    {
        string id = Path.GetFileName(file);

        // The reader was never closed before, leaking one file handle per file.
        using (TextReader fReader = new StreamReader(file))
        {
            string line;
            while ((line = fReader.ReadLine()) != null)
            {
                if (String.IsNullOrWhiteSpace(line))
                {
                    continue;
                }

                oracle.AddLink(id, line, 1);
            }
        }
    }

    return oracle;
}
/// <summary>
/// Import script results from a whitespace-separated matrix file. The first line lists the
/// source ids; every following non-blank line starts with a target id followed by one score
/// per source. All ids in the file are 1-based indices into the map file (index = id - 1).
/// </summary>
/// <param name="result">RScriptResults object (not used by this implementation)</param>
/// <returns>Script results as a TLSimilarityMatrix</returns>
public override object ImportResults(RScriptResult result)
{
    // index = id - 1
    string[] ids = Generics.ImportStrings(_mapFile);
    TLSimilarityMatrix matrix = new TLSimilarityMatrix();

    // 'using' closes the reader even when a line fails to parse or index;
    // the explicit Close() at the end was skipped in that case.
    using (TextReader resultsMatrix = new StreamReader(_outputFile))
    {
        // NOTE(review): an empty output file makes ReadLine return null here - confirm
        // whether that can happen and how it should be reported.
        string[] sources = resultsMatrix.ReadLine().Split();

        string line;
        while ((line = resultsMatrix.ReadLine()) != null)
        {
            if (String.IsNullOrWhiteSpace(line))
            {
                continue;
            }

            // [0] target id, [x+] source sims index = x - 1
            string[] entries = line.Split();
            string entry = ids[Convert.ToInt32(entries[0]) - 1];

            for (int i = 0; i < sources.Length; i++)
            {
                matrix.AddLink(ids[Convert.ToInt32(sources[i]) - 1], entry, Convert.ToDouble(entries[i + 1]));
            }
        }
    }

    return matrix;
}