public void EmptyDictionaryIndexTest()
{
    // An empty dictionary index should not prevent the tracer from producing
    // a similarity matrix; the matrix should simply contain no links.
    TLArtifactsCollection sourceArtifacts = new TLArtifactsCollection();
    sourceArtifacts.Add(new TLArtifact("id", "text"));

    TLArtifactsCollection targetArtifacts = new TLArtifactsCollection();
    targetArtifacts.Add(new TLArtifact("id", "text"));

    TLDictionaryIndex emptyDictionary = new TLDictionaryIndex();

    // Publish the inputs the component reads from the workspace.
    Workspace.Store("sourceArtifacts", sourceArtifacts);
    Workspace.Store("targetArtifacts", targetArtifacts);
    Workspace.Store("dictionaryIndex", emptyDictionary);
    ((TracerConfig)TestComponent.Configuration).SimilarityMetric = SimilarityMetricMethod.SimpleMatching;

    TestComponent.Compute();

    TLSimilarityMatrix matrix = (TLSimilarityMatrix)Workspace.Load("similarityMatrix");
    if (matrix == null || matrix.Count != 0)
    {
        Assert.Fail("Similarity Matrix should still be created but have nothing in it");
    }
}
public void TestTracingOfComponent()
{
    // Build a small corpus of source/target artifacts plus a one-term dictionary.
    // TODO: add inputs that matter
    TLArtifactsCollection sourceArtifacts = new TLArtifactsCollection();
    sourceArtifacts.Add(new TLArtifact("id1", "first text"));
    sourceArtifacts.Add(new TLArtifact("id2", "words to do stuff with"));
    sourceArtifacts.Add(new TLArtifact("id3", "some more text"));

    TLArtifactsCollection targetArtifacts = new TLArtifactsCollection();
    targetArtifacts.Add(new TLArtifact("id1", "hello world"));
    targetArtifacts.Add(new TLArtifact("id2", "very very random yes indeed"));
    targetArtifacts.Add(new TLArtifact("id3", "yep"));
    targetArtifacts.Add(new TLArtifact("id4", "chickens in the coop"));

    TLDictionaryIndex dictionary = new TLDictionaryIndex();
    dictionary.AddTermEntry("term", 3, 3, 0.2);

    // Publish the inputs the component reads from the workspace.
    Workspace.Store("sourceArtifacts", sourceArtifacts);
    Workspace.Store("targetArtifacts", targetArtifacts);
    Workspace.Store("dictionaryIndex", dictionary);
    ((TracerConfig)TestComponent.Configuration).SimilarityMetric = SimilarityMetricMethod.SimpleMatching;

    TestComponent.Compute();

    TLSimilarityMatrix simMat = (TLSimilarityMatrix)Workspace.Load("similarityMatrix");
    // TODO: add tests to make sure the output is correctly formatted
    Assert.Fail();
}
// Builds a dictionary index from the artifact collection stored in the workspace
// under "listOfArtifacts" and stores the result under "dictionaryIndex".
public override void Compute()
{
    TLArtifactsCollection artifacts = (TLArtifactsCollection)Workspace.Load("listOfArtifacts");
    Workspace.Store("dictionaryIndex", BuildDictionary(artifacts, Logger));
}
// Loads the source artifacts and dictionary index from the workspace, runs the
// tracer, and publishes the resulting similarity matrix under "similarityMatrix".
public override void Compute()
{
    TLArtifactsCollection sourceArtifacts = (TLArtifactsCollection)Workspace.Load("sourceArtifacts");
    TLDictionaryIndex dictionaryIndex = (TLDictionaryIndex)Workspace.Load("dictionaryIndex");
    TracerConfig tracerConfig = (TracerConfig)this.Configuration;

    TLSimilarityMatrix matrix = Process(sourceArtifacts, dictionaryIndex, tracerConfig);
    Workspace.Store("similarityMatrix", matrix);
}
// Validates the artifact collection and delegates to the TF-IDF index builder.
// Throws ComponentException when the collection is null.
private static TLDictionaryIndex BuildDictionary(TLArtifactsCollection listOfArtifacts, ComponentLogger logger)
{
    if (listOfArtifacts == null)
    {
        throw new ComponentException("Received null listOfArtifacts");
    }
    return TFIDFIndexBuilder.build(listOfArtifacts, logger);
}
public void NullSourceArtifactsTest()
{
    // Store a null source-artifact collection; Compute() is expected to reject it.
    TLArtifactsCollection targetArtifacts = new TLArtifactsCollection();
    TLDictionaryIndex dictionary = new TLDictionaryIndex();

    Workspace.Store("sourceArtifacts", null);
    Workspace.Store("targetArtifacts", targetArtifacts);
    Workspace.Store("dictionaryIndex", dictionary);
    ((TracerConfig)TestComponent.Configuration).SimilarityMetric = SimilarityMetricMethod.SimpleMatching;

    TestComponent.Compute();
}
/// <summary>
/// Computes the traceability between source and target artifacts using the dictionary
/// index and American National Corpus (ANC) term weights.
/// </summary>
/// <param name="sourceArtifacts">The source artifacts.</param>
/// <param name="targetArtifacts">The target artifacts.</param>
/// <param name="dict">The dictionary index to search against.</param>
/// <param name="ancTermsWeights">The ANC term weights.</param>
/// <param name="config">The tracer configuration (selects the similarity metric).</param>
/// <returns>Similarity matrix with links between source and target artifacts</returns>
/// <exception cref="ComponentException">Thrown when any of the inputs is null.</exception>
private static TLSimilarityMatrix ComputeTraceability(TLArtifactsCollection sourceArtifacts, TLArtifactsCollection targetArtifacts, TLDictionaryIndex dict, TLKeyValuePairsList ancTermsWeights, TracerConfig config)
{
    if (sourceArtifacts == null)
    {
        throw new ComponentException("Received source artifacts are null!");
    }
    if (targetArtifacts == null)
    {
        throw new ComponentException("Received target artifacts are null!");
    }
    if (dict == null)
    {
        throw new ComponentException("Received dictionary index is null!");
    }
    if (ancTermsWeights == null)
    {
        throw new ComponentException("Received 'ancTermsWeights' is null!");
    }

    TLSimilarityMatrix similarityMatrix = new TLSimilarityMatrix();
    ANCSearcher searcher = new ANCSearcher(SimilarityMetricFactory.GetSimiliarityMetric(config.SimilarityMetric));

    // Prepare the ANC data once instead of on every loop iteration: the original
    // code re-ran PrepareANCData(ancTermsWeights) per source artifact even though
    // its input never changes inside the loop. (Assumes PrepareANCData is a pure
    // transformation of ancTermsWeights — it takes no per-artifact input.)
    var preparedAncData = PrepareANCData(ancTermsWeights);

    // Executes one search per source artifact and records each ranked hit
    // as a link in the similarity matrix.
    foreach (TLArtifact sourceArtifact in sourceArtifacts.Values)
    {
        List<Result> results = searcher.search(sourceArtifact.Text, dict, preparedAncData);
        foreach (Result r in results)
        {
            similarityMatrix.AddLink(sourceArtifact.Id, r.ArtifactId, r.Ranking);
        }
    }
    return similarityMatrix;
}
// Entry point of the ANC tracer component: loads all inputs from the workspace,
// computes the trace links, and publishes the similarity matrix.
public override void Compute()
{
    Logger.Trace("Start component ANCTracerComponent");

    TLArtifactsCollection sourceArtifacts = (TLArtifactsCollection)Workspace.Load("sourceArtifacts");
    TLArtifactsCollection targetArtifacts = (TLArtifactsCollection)Workspace.Load("targetArtifacts");
    TLDictionaryIndex dictionaryIndex = (TLDictionaryIndex)Workspace.Load("dictionaryIndex");
    TLKeyValuePairsList ancWeights = (TLKeyValuePairsList)Workspace.Load("ancTermsWeights");
    TracerConfig tracerConfig = (TracerConfig)this.Configuration;

    TLSimilarityMatrix matrix = ComputeTraceability(sourceArtifacts, targetArtifacts, dictionaryIndex, ancWeights, tracerConfig);
    Workspace.Store("similarityMatrix", matrix);

    Logger.Trace("Completed component ANCTracerComponent");
}
public void IncorrectInputSourceArtifactsType()
{
    // Store a plain string where a TLArtifactsCollection is expected;
    // Compute() is expected to fail on the cast.
    TLArtifactsCollection targetArtifacts = new TLArtifactsCollection();
    targetArtifacts.Add(new TLArtifact("id1", "text2"));

    TLDictionaryIndex dictionary = new TLDictionaryIndex();
    dictionary.AddTermEntry("term", 1, 1, 1);

    Workspace.Store("sourceArtifacts", "incorrect type");
    Workspace.Store("targetArtifacts", targetArtifacts);
    Workspace.Store("dictionaryIndex", dictionary);
    ((TracerConfig)TestComponent.Configuration).SimilarityMetric = SimilarityMetricMethod.SimpleMatching;

    TestComponent.Compute();
}
// Runs a search for every source artifact against the dictionary index and
// collects the ranked results into a similarity matrix.
// Throws ComponentException when sourceArtifacts or dict is null.
private static TLSimilarityMatrix Process(TLArtifactsCollection sourceArtifacts, TLDictionaryIndex dict, TracerConfig config)
{
    if (sourceArtifacts == null)
    {
        throw new ComponentException("Received null sourceArtifacts");
    }
    if (dict == null)
    {
        throw new ComponentException("Received null dictionaryIndex");
    }

    TLSimilarityMatrix matrix = new TLSimilarityMatrix();
    Searcher searcher = new Searcher(SimilarityMetricFactory.GetSimiliarityMetric(config.SimilarityMetric));

    // One query per source artifact; every returned hit becomes a link.
    foreach (TLArtifact sourceArtifact in sourceArtifacts.Values)
    {
        String query = sourceArtifact.Text;
        List<Result> hits = searcher.search(query, dict);
        foreach (Result hit in hits)
        {
            matrix.AddLink(sourceArtifact.Id, hit.ArtifactId, hit.Ranking);
        }
    }
    return matrix;
}
/**
 * Builds a raw-frequency dictionary index.
 * Input is a map of artifact.Id and processed text of each artifact.
 */
public static TLDictionaryIndex build(TLArtifactsCollection setOfProcessedDocuments, ComponentLogger logger)
{
    // Accumulates, per artifact, the sum of squared term frequencies;
    // the document vector length is the square root of that sum.
    Dictionary<string, double> documentVectorLength = new Dictionary<string, double>();

    TLDictionaryIndex dict = new TLDictionaryIndex();

    // First pass: count term and posting frequencies for every document.
    foreach (TLArtifact artifact in setOfProcessedDocuments.Values)
    {
        string[] terms = artifact.Text.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
        if (terms.Length == 0)
        {
            logger.Warn(String.Format("Artifact Id {0} is empty.", artifact.Id));
        }

        foreach (string t in terms)
        {
            if (!dict.ContainsTermEntry(t))
            {
                // First occurrence of the term anywhere in the corpus.
                TLTermEntry newEntry = dict.AddTermEntry(t, 1, 1, 1.0);
                newEntry.AddPosting(artifact.Id, 1, 1.0);
            }
            else
            {
                TLTermEntry entry = dict.GetTermEntry(t);
                entry.TotalFrequencyAcrossArtifacts += 1;
                entry.Weight = 1.0;

                if (!entry.ContainsPosting(artifact.Id))
                {
                    // First occurrence of the term in this particular document.
                    entry.NumberOfArtifactsContainingTerm += 1;
                    entry.AddPosting(artifact.Id, 1, 1.0);
                }
                else
                {
                    // Term seen again in the same document: bump its counts.
                    TLPosting existing = entry.GetPosting(artifact.Id);
                    existing.Frequency += 1;
                    existing.Weight += 1.0;
                }
            }
        }
    }

    // Second pass: sum the squared posting frequencies per document.
    foreach (TLTermEntry t in dict.TermEntries)
    {
        foreach (TLPosting p in t.Postings)
        {
            string artId = p.ArtifactId;
            double squaredSum = Math.Pow(p.Frequency, 2);
            if (documentVectorLength.ContainsKey(artId))
            {
                // The document has other terms already accumulated.
                squaredSum += documentVectorLength[artId];
            }
            documentVectorLength[artId] = squaredSum;
        }
    }

    // Finally, take the square root of each accumulated sum and record it as the
    // document vector weight on the dictionary (not the internal structure).
    foreach (TLArtifact artifact in setOfProcessedDocuments.Values)
    {
        if (documentVectorLength.ContainsKey(artifact.Id))
        {
            dict.SetDocumentVectorWeight(artifact.Id, Math.Sqrt(documentVectorLength[artifact.Id]));
        }
    }

    return dict;
}
/**
 * Builds a tf-idf weighted dictionary index.
 * Input is a map of artifact.Id and processed text of each artifact.
 */
public static new TLDictionaryIndex build(TLArtifactsCollection setOfProcessedDocuments, ComponentLogger logger)
{
    int N = setOfProcessedDocuments.Count; // total number of documents

    // Accumulates, per document, the sum of squared term weights; the document
    // vector length is the square root of that sum.
    Dictionary<string, double> documentVectorLength = new Dictionary<string, double>();

    // Start from the parent builder, which calculates the basic frequencies.
    TLDictionaryIndex dict = BasicIndexBuilder.build(setOfProcessedDocuments, logger);

    foreach (TLTermEntry term in dict.TermEntries)
    {
        // The idf of each term is stored in its Weight, for weighing queries later.
        double idf = Math.Log10(N / ((double)term.NumberOfArtifactsContainingTerm));
        term.Weight = idf;

        foreach (TLPosting posting in term.Postings)
        {
            // Each posting weight becomes frequency * idf.
            double newWeight = posting.Frequency * idf;
            posting.Weight = newWeight;

            // Fold the squared weight into the document's running total.
            String docId = posting.ArtifactId;
            double squaredSum = Math.Pow(newWeight, 2);
            if (documentVectorLength.ContainsKey(docId))
            {
                squaredSum += documentVectorLength[docId];
            }
            documentVectorLength[docId] = squaredSum;
        }
    }

    // Take the square root of each running total and record it as the document
    // vector weight on the dictionary (not the internal structure).
    foreach (TLArtifact d in setOfProcessedDocuments.Values)
    {
        String docId = d.Id;
        if (documentVectorLength.ContainsKey(docId))
        {
            double vectorLength = Math.Sqrt(documentVectorLength[docId]);
            documentVectorLength[docId] = vectorLength;
            dict.SetDocumentVectorWeight(docId, vectorLength);
        }
    }

    // Lastly, normalize every posting weight by its document's vector length.
    foreach (TLTermEntry term in dict.TermEntries)
    {
        foreach (TLPosting posting in term.Postings)
        {
            posting.Weight = posting.Weight / documentVectorLength[posting.ArtifactId];
        }
    }

    return dict;
}
/// <summary>
/// Executes a query against the dictionary index and returns the matching
/// documents ranked by similarity (computed via the configured metric).
/// </summary>
public List <Result> search(String query, TLDictionaryIndex dict)
{
    List<Result> results = new List<Result>();
    Dictionary<string, double> intermediateResults = new Dictionary<string, double>(); // per-document dot products
    Dictionary<string, int> queryTermFrequency = new Dictionary<string, int>();        // term frequencies in the query
    Dictionary<string, double> queryTermWeight = new Dictionary<string, double>();     // term weights in the query
    double queryVectorLength = 0.0;                                                    // document vector length of the query

    // Tokenize the query and count the frequency of each term, keeping only
    // words that actually appear in the dictionary.
    string[] queryTerms = query.Split(new char[] { ' ' });
    foreach (string qterm in queryTerms)
    {
        if (dict.ContainsTermEntry(qterm))
        {
            if (queryTermFrequency.ContainsKey(qterm))
            {
                queryTermFrequency[qterm] += 1;
            }
            else
            {
                queryTermFrequency.Add(qterm, 1);
            }
        }
    }

    // Weight each query term (frequency TIMES the term's specificity) and
    // accumulate the dot product of query terms x document terms.
    // Note: the specificity is stored in the term's Weight:
    //   basic dictionary -> 1, tf-idf dictionary -> idf, signal-noise -> signal.
    foreach (string qterm in queryTermFrequency.Keys)
    {
        TLTermEntry term = dict.GetTermEntry(qterm);
        double weight = queryTermFrequency[qterm] * term.Weight;
        queryTermWeight.Add(qterm, weight);

        // Update the query's document vector length.
        queryVectorLength += Math.Pow(weight, 2);

        // Visit every posting that contains this term and accumulate the product.
        foreach (TLPosting posting in term.Postings)
        {
            string docId = posting.ArtifactId;
            double product = queryTermWeight[qterm] * posting.Weight;
            if (intermediateResults.ContainsKey(docId))
            {
                intermediateResults[docId] += product;
            }
            else
            {
                intermediateResults.Add(docId, product);
            }
        }
    }

    // The query vector length is the square root of the sum of squared weights.
    queryVectorLength = Math.Sqrt(queryVectorLength);

    // Convert each accumulated dot product into a final similarity score.
    foreach (string docId in intermediateResults.Keys)
    {
        double similarity = _similarityMetric.ComputeSimilarity(intermediateResults[docId], queryVectorLength, dict.GetDocumentVectorWeight(docId));
        results.Add(new Result(docId, similarity));
    }

    results.Sort();
    return results;
}