/// <summary>
/// Runs the component with one source artifact and an empty target collection,
/// then checks that a similarity matrix was still stored in the workspace and
/// that it contains no entries.
/// </summary>
public void EmptyTargetArtifactsTest()
{
    // Arrange: one source artifact, no target artifacts, a single-term dictionary.
    TLArtifactsCollection sources = new TLArtifactsCollection();
    sources.Add(new TLArtifact("id", "text"));

    TLArtifactsCollection targets = new TLArtifactsCollection();

    TLDictionaryIndex index = new TLDictionaryIndex();
    index.AddTermEntry("term", 1, 1, 1);

    Workspace.Store("sourceArtifacts", sources);
    Workspace.Store("targetArtifacts", targets);
    Workspace.Store("dictionaryIndex", index);

    TracerConfig config = (TracerConfig)TestComponent.Configuration;
    config.SimilarityMetric = SimilarityMetricMethod.SimpleMatching;

    // Act
    TestComponent.Compute();

    // Assert: the output must exist and be empty.
    TLSimilarityMatrix result = (TLSimilarityMatrix)Workspace.Load("similarityMatrix");
    bool createdAndEmpty = result != null && result.Count == 0;
    if (!createdAndEmpty)
    {
        Assert.Fail("Similarity Matrix should still be created but have nothing in it");
    }
}
/// <summary>
/// Feeds three source artifacts, four target artifacts and a one-term dictionary
/// to the component, runs it, and loads the resulting similarity matrix.
/// The test is still a placeholder: it ends with an unconditional failure until
/// real output checks are written.
/// </summary>
public void TestTracingOfComponent()
{
    // TODO: add inputs that matter
    string[][] sourceData =
    {
        new[] { "id1", "first text" },
        new[] { "id2", "words to do stuff with" },
        new[] { "id3", "some more text" },
    };
    string[][] targetData =
    {
        new[] { "id1", "hello world" },
        new[] { "id2", "very very random yes indeed" },
        new[] { "id3", "yep" },
        new[] { "id4", "chickens in the coop" },
    };

    TLArtifactsCollection sources = new TLArtifactsCollection();
    foreach (string[] row in sourceData)
    {
        sources.Add(new TLArtifact(row[0], row[1]));
    }

    TLArtifactsCollection targets = new TLArtifactsCollection();
    foreach (string[] row in targetData)
    {
        targets.Add(new TLArtifact(row[0], row[1]));
    }

    TLDictionaryIndex index = new TLDictionaryIndex();
    index.AddTermEntry("term", 3, 3, 0.2);

    Workspace.Store("sourceArtifacts", sources);
    Workspace.Store("targetArtifacts", targets);
    Workspace.Store("dictionaryIndex", index);

    TracerConfig config = (TracerConfig)TestComponent.Configuration;
    config.SimilarityMetric = SimilarityMetricMethod.SimpleMatching;

    TestComponent.Compute();

    // Loaded (and cast) so a missing or wrongly-typed output surfaces here.
    TLSimilarityMatrix matrix = (TLSimilarityMatrix)Workspace.Load("similarityMatrix");

    // TODO: add tests to make sure the output is correctly formatted
    Assert.Fail();
}
/// <summary>
/// Stores a plain string under the "sourceArtifacts" key (instead of an artifacts
/// collection) alongside valid target artifacts and a dictionary, then runs the
/// component.
/// </summary>
/// <remarks>
/// NOTE(review): the method contains no assertion; presumably the expected failure
/// is declared by a test attribute outside this view - confirm.
/// </remarks>
public void IncorrectInputSourceArtifactsType()
{
    // Valid target-side inputs.
    TLArtifactsCollection targets = new TLArtifactsCollection();
    targets.Add(new TLArtifact("id1", "text2"));

    TLDictionaryIndex index = new TLDictionaryIndex();
    index.AddTermEntry("term", 1, 1, 1);

    // Deliberately wrong type for the source artifacts input.
    Workspace.Store("sourceArtifacts", "incorrect type");
    Workspace.Store("targetArtifacts", targets);
    Workspace.Store("dictionaryIndex", index);

    TracerConfig config = (TracerConfig)TestComponent.Configuration;
    config.SimilarityMetric = SimilarityMetricMethod.SimpleMatching;

    TestComponent.Compute();
}
/**
 * Builds a dictionary index from a collection of already-processed artifacts.
 *
 * Each artifact's text is split on space characters (empty entries dropped).
 * Every term occurrence is recorded in the index: term entries keep a total
 * frequency and the number of artifacts containing the term, and each posting
 * keeps the per-artifact frequency with a weight of 1.0 per occurrence.
 * Afterwards the vector length of every artifact that has at least one posting
 * (square root of the sum of its squared posting frequencies) is stored on the
 * index via SetDocumentVectorWeight.
 *
 * setOfProcessedDocuments: artifacts whose Text holds the processed terms.
 * logger: receives a warning for every artifact that yields no terms.
 * Returns the populated dictionary index.
 */
public static TLDictionaryIndex build(TLArtifactsCollection setOfProcessedDocuments, ComponentLogger logger)
{
    TLDictionaryIndex index = new TLDictionaryIndex();
    char[] separators = { ' ' };

    // Pass 1: count every term occurrence, artifact by artifact.
    foreach (TLArtifact document in setOfProcessedDocuments.Values)
    {
        string[] tokens = document.Text.Split(separators, StringSplitOptions.RemoveEmptyEntries);
        if (tokens.Length == 0)
        {
            logger.Warn(String.Format("Artifact Id {0} is empty.", document.Id));
        }

        foreach (string token in tokens)
        {
            if (index.ContainsTermEntry(token))
            {
                // Term seen before: bump its global counters.
                TLTermEntry known = index.GetTermEntry(token);
                known.TotalFrequencyAcrossArtifacts += 1;
                known.Weight = 1.0;

                if (known.ContainsPosting(document.Id))
                {
                    // Term already seen in this artifact.
                    TLPosting current = known.GetPosting(document.Id);
                    current.Frequency += 1;
                    current.Weight += 1.0;
                }
                else
                {
                    // First occurrence of the term in this artifact.
                    known.NumberOfArtifactsContainingTerm += 1;
                    known.AddPosting(document.Id, 1, 1.0);
                }
            }
            else
            {
                // Brand-new term: create its entry together with the first posting.
                TLTermEntry created = index.AddTermEntry(token, 1, 1, 1.0);
                created.AddPosting(document.Id, 1, 1.0);
            }
        }
    }

    // Pass 2: accumulate, per artifact, the sum of squared posting frequencies.
    // These sums are later square-rooted to obtain each document's vector length.
    Dictionary<string, double> sumOfSquaresByArtifact = new Dictionary<string, double>();
    foreach (TLTermEntry entry in index.TermEntries)
    {
        foreach (TLPosting posting in entry.Postings)
        {
            string artifactId = posting.ArtifactId;
            double total = Math.Pow(posting.Frequency, 2);

            double accumulated;
            if (sumOfSquaresByArtifact.TryGetValue(artifactId, out accumulated))
            {
                // The artifact already has contributions from other terms.
                total += accumulated;
            }

            sumOfSquaresByArtifact[artifactId] = total;
        }
    }

    // Pass 3: store the square root of each sum on the index itself.
    // Artifacts without any posting (no terms) are skipped.
    foreach (TLArtifact document in setOfProcessedDocuments.Values)
    {
        double sumOfSquares;
        if (!sumOfSquaresByArtifact.TryGetValue(document.Id, out sumOfSquares))
        {
            continue;
        }

        index.SetDocumentVectorWeight(document.Id, Math.Sqrt(sumOfSquares));
    }

    return index;
}