/// <summary>
/// Computes the traceability between source and target artifacts using the dictionary index and American Corpus (ANC) term weights.
/// </summary>
/// <param name="sourceArtifacts">The source artifacts.</param>
/// <param name="targetArtifacts">The target artifacts.</param>
/// <param name="dict">The dictionary index built over the target artifacts.</param>
/// <param name="ancTermsWeights">The ANC term weights.</param>
/// <param name="config">The tracer configuration.</param>
/// <returns>Similarity matrix with links between source and target artifacts.</returns>
private static TLSimilarityMatrix ComputeTraceability(TLArtifactsCollection sourceArtifacts,
                                                      TLArtifactsCollection targetArtifacts,
                                                      TLDictionaryIndex dict,
                                                      TLKeyValuePairsList ancTermsWeights,
                                                      TracerConfig config)
{
    if (sourceArtifacts == null)
    {
        throw new ComponentException("Received source artifacts are null!");
    }
    if (targetArtifacts == null)
    {
        throw new ComponentException("Received target artifacts are null!");
    }
    if (dict == null)
    {
        throw new ComponentException("Received dictionary index is null!");
    }
    if (ancTermsWeights == null)
    {
        throw new ComponentException("Received 'ancTermsWeights' is null!");
    }

    TLSimilarityMatrix similarityMatrix = new TLSimilarityMatrix();
    ANCSearcher searcher = new ANCSearcher(SimilarityMetricFactory.GetSimiliarityMetric(config.SimilarityMetric));

    // Iterates over all the source artifacts to determine the probabilities to target artifacts - by executing a search
    foreach (TLArtifact sourceArtifact in sourceArtifacts.Values)
    {
        String query = sourceArtifact.Text;

        // Executes the query
        List<Result> results = searcher.search(query, dict, PrepareANCData(ancTermsWeights));

        // Iterates over the results and stores them in the matrix
        foreach (Result r in results)
        {
            string targetArtifactId = r.ArtifactId;
            similarityMatrix.AddLink(sourceArtifact.Id, targetArtifactId, r.Ranking);
        }
    }

    return similarityMatrix;
}
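// Hedged sketch of PrepareANCData, which is called above but not included in this snippet set. This is an
// assumption about its behavior, not the original implementation: it assumes TLKeyValuePairsList enumerates
// KeyValuePair<string, double> entries mapping a term to its ANC weight.
private static Dictionary<string, double> PrepareANCData(TLKeyValuePairsList ancTermsWeights)
{
    Dictionary<string, double> weights = new Dictionary<string, double>();
    foreach (KeyValuePair<string, double> pair in ancTermsWeights)
    {
        weights[pair.Key] = pair.Value; // if a term repeats, the last weight wins
    }
    return weights;
}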
public void EmptyTargetArtifactsTest()
{
    TLArtifactsCollection sourceArtifacts = new TLArtifactsCollection();
    sourceArtifacts.Add(new TLArtifact("id", "text"));

    TLArtifactsCollection targetArtifacts = new TLArtifactsCollection();

    TLDictionaryIndex dictionary = new TLDictionaryIndex();
    dictionary.AddTermEntry("term", 1, 1, 1);

    Workspace.Store("sourceArtifacts", sourceArtifacts);
    Workspace.Store("targetArtifacts", targetArtifacts);
    Workspace.Store("dictionaryIndex", dictionary);

    ((TracerConfig)TestComponent.Configuration).SimilarityMetric = SimilarityMetricMethod.SimpleMatching;

    TestComponent.Compute();

    TLSimilarityMatrix simMat = (TLSimilarityMatrix)Workspace.Load("similarityMatrix");
    if (simMat == null || simMat.Count != 0)
    {
        Assert.Fail("Similarity Matrix should still be created but have nothing in it");
    }
}
private static TLSimilarityMatrix Process(TLArtifactsCollection sourceArtifacts, TLDictionaryIndex dict, TracerConfig config)
{
    if (sourceArtifacts == null)
    {
        throw new ComponentException("Received null sourceArtifacts");
    }
    if (dict == null)
    {
        throw new ComponentException("Received null dictionaryIndex");
    }

    TLSimilarityMatrix similarityMatrix = new TLSimilarityMatrix();
    Searcher searcher = new Searcher(SimilarityMetricFactory.GetSimiliarityMetric(config.SimilarityMetric));

    // Iterates over all the source artifacts to determine the probabilities to target artifacts - by executing a search
    foreach (TLArtifact sourceArtifact in sourceArtifacts.Values)
    {
        String query = sourceArtifact.Text;

        // Executes the query
        List<Result> results = searcher.search(query, dict);

        // Iterates over the results and stores them in the matrix
        foreach (Result r in results)
        {
            string targetArtifactId = r.ArtifactId;
            similarityMatrix.AddLink(sourceArtifact.Id, targetArtifactId, r.Ranking);
        }
    }

    return similarityMatrix;
}
public void IncorrectInputSourceArtifactsType()
{
    TLArtifactsCollection targetArtifacts = new TLArtifactsCollection();
    targetArtifacts.Add(new TLArtifact("id1", "text2"));

    TLDictionaryIndex dictionary = new TLDictionaryIndex();
    dictionary.AddTermEntry("term", 1, 1, 1);

    Workspace.Store("sourceArtifacts", "incorrect type");
    Workspace.Store("targetArtifacts", targetArtifacts);
    Workspace.Store("dictionaryIndex", dictionary);

    ((TracerConfig)TestComponent.Configuration).SimilarityMetric = SimilarityMetricMethod.SimpleMatching;

    TestComponent.Compute();
}
public void NullSourceArtifactsTest()
{
    TLArtifactsCollection targetArtifacts = new TLArtifactsCollection();
    TLDictionaryIndex dictionary = new TLDictionaryIndex();

    Workspace.Store("sourceArtifacts", null);
    Workspace.Store("targetArtifacts", targetArtifacts);
    Workspace.Store("dictionaryIndex", dictionary);

    ((TracerConfig)TestComponent.Configuration).SimilarityMetric = SimilarityMetricMethod.SimpleMatching;

    TestComponent.Compute();
}
public void TestTracingOfComponent()
{
    TLArtifactsCollection sourceArtifacts = new TLArtifactsCollection();
    TLArtifactsCollection targetArtifacts = new TLArtifactsCollection();
    TLDictionaryIndex dictionary = new TLDictionaryIndex();

    // TODO: add inputs that matter
    sourceArtifacts.Add(new TLArtifact("id1", "first text"));
    sourceArtifacts.Add(new TLArtifact("id2", "words to do stuff with"));
    sourceArtifacts.Add(new TLArtifact("id3", "some more text"));

    targetArtifacts.Add(new TLArtifact("id1", "hello world"));
    targetArtifacts.Add(new TLArtifact("id2", "very very random yes indeed"));
    targetArtifacts.Add(new TLArtifact("id3", "yep"));
    targetArtifacts.Add(new TLArtifact("id4", "chickens in the coop"));

    dictionary.AddTermEntry("term", 3, 3, 0.2);

    Workspace.Store("sourceArtifacts", sourceArtifacts);
    Workspace.Store("targetArtifacts", targetArtifacts);
    Workspace.Store("dictionaryIndex", dictionary);

    ((TracerConfig)TestComponent.Configuration).SimilarityMetric = SimilarityMetricMethod.SimpleMatching;

    TestComponent.Compute();

    TLSimilarityMatrix simMat = (TLSimilarityMatrix)Workspace.Load("similarityMatrix");

    // TODO: add tests to make sure the output is correctly formatted
    Assert.Fail();
}
public List<Result> search(String query, TLDictionaryIndex dict, Dictionary<string, double> ancTermsWeights)
{
    // Initializes the data structures
    List<Result> results = new List<Result>();                                         // Result array
    Dictionary<string, double> intermediateResults = new Dictionary<string, double>(); // Where the intermediate results of the query are kept
    Dictionary<string, int> queryTermFrequency = new Dictionary<string, int>();        // Keeps track of term frequencies
    Dictionary<string, double> queryTermWeight = new Dictionary<string, double>();     // Keeps track of term weights
    double queryVectorLength = 0.0;                                                    // The document vector length of the query

    // The query is broken down into tokens
    string[] queryTerms = query.Split(new char[] { ' ' });

    // Iterates over each query term to compute the term frequency
    foreach (string qterm in queryTerms)
    {
        // It only cares about those words that are in the dictionary
        if (dict.ContainsTermEntry(qterm))
        {
            if (!queryTermFrequency.ContainsKey(qterm))
            {
                // First time the query word is encountered
                queryTermFrequency.Add(qterm, 1);
            }
            else
            {
                // The query word is already there, so the frequency gets increased
                queryTermFrequency[qterm] += 1;
            }
        }
    }

    // Iterates over the resulting query terms to compute their weights and the dot product of the query terms x the document terms
    foreach (string qterm in queryTermFrequency.Keys)
    {
        // Gets the term from the dictionary
        TLTermEntry term = dict.GetTermEntry(qterm);

        // It computes the weight of a term, i.e. the frequency TIMES the term's specificity.
        // Note: the specificity of the term is stored in the weight.
        //   For the basic dictionary this is just 1
        //   For the tf-idf dictionary this is the idf
        //   For the signal-noise dictionary this is the signal
        //double weight = queryTermFrequency[qterm] * term.Weight;
        double ancWeight;
        if (!ancTermsWeights.TryGetValue(qterm, out ancWeight))
        {
            ancWeight = 1.0;
        }
        double weight = queryTermFrequency[qterm] * ancWeight;
        queryTermWeight.Add(qterm, weight);

        // Updates the document vector length of the query
        queryVectorLength += Math.Pow(weight, 2);

        // It now iterates over all the postings that have this term
        foreach (TLPosting posting in term.Postings)
        {
            string docId = posting.ArtifactId;

            // Calculates the product of the query times the posting for this particular term
            double r = queryTermWeight[qterm] * posting.Weight;
            if (intermediateResults.ContainsKey(docId))
            {
                intermediateResults[docId] += r;
            }
            else
            {
                intermediateResults.Add(docId, r);
            }
        }
    }

    // The document vector length for the query is the square root of the sum of the squares of the term weights
    queryVectorLength = Math.Sqrt(queryVectorLength);

    // It iterates over the intermediate results to create the final array that is returned to the user
    foreach (string docId in intermediateResults.Keys)
    {
        double similarity = _similarityMetric.ComputeSimilarity(intermediateResults[docId], queryVectorLength, dict.GetDocumentVectorWeight(docId));
        Result r = new Result(docId, similarity);
        results.Add(r);
    }

    // Sorts the results
    results.Sort();

    return results;
}
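// Hedged usage sketch, not part of the original component: shows how the search method above could be
// driven once a dictionary index and ANC weights are available. The ANCSearcher construction mirrors
// ComputeTraceability; the query text and the weight value are assumptions for illustration only.
private static void DemoSearch(TLDictionaryIndex dict)
{
    ANCSearcher searcher = new ANCSearcher(SimilarityMetricFactory.GetSimiliarityMetric(SimilarityMetricMethod.SimpleMatching));
    Dictionary<string, double> ancWeights = new Dictionary<string, double> { { "term", 0.8 } };

    List<Result> hits = searcher.search("some query text", dict, ancWeights);
    foreach (Result hit in hits)
    {
        Console.WriteLine("{0} -> {1}", hit.ArtifactId, hit.Ranking);
    }
}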
/// <summary>
/// Builds the dictionary index. The input is a collection of processed documents, mapping each artifact Id to its processed text.
/// </summary>
public static TLDictionaryIndex build(TLArtifactsCollection setOfProcessedDocuments, ComponentLogger logger)
{
    // Variables
    TLTermEntry termEntry;
    TLPosting posting;
    double vectorLength;

    // Stores the vector length of each document - this is used to normalize the term weights.
    // The vector length is the square root of the sum of the squares of all the term weights.
    Dictionary<string, double> documentVectorLength = new Dictionary<string, double>();

    // Creates the dictionary
    TLDictionaryIndex dict = new TLDictionaryIndex();

    // Iterates over all the documents
    foreach (TLArtifact artifact in setOfProcessedDocuments.Values)
    {
        string[] terms = artifact.Text.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
        if (terms.Length == 0)
        {
            logger.Warn(String.Format("Artifact Id {0} is empty.", artifact.Id));
        }

        // Iterates over all the terms
        foreach (string t in terms)
        {
            // Checks if that term already has a term entry
            if (!dict.ContainsTermEntry(t))
            {
                // New term
                termEntry = dict.AddTermEntry(t, 1, 1, 1.0);
                posting = termEntry.AddPosting(artifact.Id, 1, 1.0);
            }
            else
            {
                // Existing term
                termEntry = dict.GetTermEntry(t);
                termEntry.TotalFrequencyAcrossArtifacts += 1;
                termEntry.Weight = 1.0;

                // Checks if there is already a posting for this document
                if (!termEntry.ContainsPosting(artifact.Id))
                {
                    // New posting
                    termEntry.NumberOfArtifactsContainingTerm += 1;
                    posting = termEntry.AddPosting(artifact.Id, 1, 1.0);
                }
                else
                {
                    // Existing posting
                    posting = termEntry.GetPosting(artifact.Id);
                    posting.Frequency += 1;
                    posting.Weight += 1.0;
                }
            }
        }
    }

    string artId;

    // Now that all the counts are in, it calculates the document vector weights
    foreach (TLTermEntry t in dict.TermEntries)
    {
        foreach (TLPosting p in t.Postings)
        {
            artId = p.ArtifactId;
            vectorLength = Math.Pow(p.Frequency, 2);
            if (documentVectorLength.ContainsKey(artId))
            {
                // The document has other terms
                vectorLength += documentVectorLength[artId];
            }
            documentVectorLength[artId] = vectorLength;
        }
    }

    // Finally, we need to take the square root of all entries in the document vector length
    foreach (TLArtifact artifact in setOfProcessedDocuments.Values)
    {
        if (documentVectorLength.ContainsKey(artifact.Id))
        {
            vectorLength = Math.Sqrt(documentVectorLength[artifact.Id]);

            // Here we update the document vector length of the dictionary - not the internal structure any more
            dict.SetDocumentVectorWeight(artifact.Id, vectorLength);
        }
    }

    return dict;
}
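// Hedged usage sketch, not part of the original component: builds a dictionary index over a tiny artifact
// set with the build method above. The artifact Ids and texts are made up, and the logger is assumed to be
// whatever ComponentLogger instance the hosting component already has available.
private static TLDictionaryIndex BuildDemoIndex(ComponentLogger logger)
{
    TLArtifactsCollection docs = new TLArtifactsCollection();
    docs.Add(new TLArtifact("a1", "alpha beta beta"));
    docs.Add(new TLArtifact("a2", "beta gamma"));

    return build(docs, logger);
}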