Beispiel #1
0
        /// <summary>
        /// Runs the component with a single source artifact and an empty target
        /// collection, then checks that a similarity matrix was still stored in the
        /// workspace and that it contains no entries.
        /// </summary>
        public void EmptyTargetArtifactsTest()
        {
            // Arrange: one source artifact, no targets, a one-term dictionary.
            TLArtifactsCollection sources = new TLArtifactsCollection();
            TLArtifactsCollection emptyTargets = new TLArtifactsCollection();
            TLDictionaryIndex index = new TLDictionaryIndex();

            sources.Add(new TLArtifact("id", "text"));
            index.AddTermEntry("term", 1, 1, 1);

            Workspace.Store("sourceArtifacts", sources);
            Workspace.Store("targetArtifacts", emptyTargets);
            Workspace.Store("dictionaryIndex", index);

            TracerConfig config = (TracerConfig)TestComponent.Configuration;
            config.SimilarityMetric = SimilarityMetricMethod.SimpleMatching;

            // Act
            TestComponent.Compute();

            // Assert: the matrix must exist and be empty.
            TLSimilarityMatrix result = (TLSimilarityMatrix)Workspace.Load("similarityMatrix");
            bool isPresentAndEmpty = result != null && result.Count == 0;

            if (!isPresentAndEmpty)
            {
                Assert.Fail("Similarity Matrix should still be created but have nothing in it");
            }
        }
Beispiel #2
0
        /// <summary>
        /// Placeholder test: feeds three source artifacts, four target artifacts and a
        /// one-term dictionary to the component, runs it, loads the resulting
        /// similarity matrix and then fails unconditionally because the real output
        /// checks have not been written yet.
        /// </summary>
        public void TestTracingOfComponent()
        {
            // TODO: add inputs that matter
            TLArtifact[] sourceFixtures =
            {
                new TLArtifact("id1", "first text"),
                new TLArtifact("id2", "words to do stuff with"),
                new TLArtifact("id3", "some more text")
            };
            TLArtifact[] targetFixtures =
            {
                new TLArtifact("id1", "hello world"),
                new TLArtifact("id2", "very very random yes indeed"),
                new TLArtifact("id3", "yep"),
                new TLArtifact("id4", "chickens in the coop")
            };

            TLArtifactsCollection sources = new TLArtifactsCollection();
            foreach (TLArtifact fixture in sourceFixtures)
            {
                sources.Add(fixture);
            }

            TLArtifactsCollection targets = new TLArtifactsCollection();
            foreach (TLArtifact fixture in targetFixtures)
            {
                targets.Add(fixture);
            }

            TLDictionaryIndex index = new TLDictionaryIndex();
            index.AddTermEntry("term", 3, 3, 0.2);

            Workspace.Store("sourceArtifacts", sources);
            Workspace.Store("targetArtifacts", targets);
            Workspace.Store("dictionaryIndex", index);

            TracerConfig config = (TracerConfig)TestComponent.Configuration;
            config.SimilarityMetric = SimilarityMetricMethod.SimpleMatching;

            TestComponent.Compute();

            // Loaded so the cast to the expected output type is still exercised,
            // even though nothing inspects the matrix yet.
            TLSimilarityMatrix result = (TLSimilarityMatrix)Workspace.Load("similarityMatrix");

            // TODO: add tests to make sure the output is correctly formatted
            Assert.Fail();
        }
Beispiel #3
0
        /// <summary>
        /// Stores a plain string under the "sourceArtifacts" workspace key instead of
        /// an artifacts collection and then runs the component. The method contains
        /// no assertions of its own.
        /// </summary>
        public void IncorrectInputSourceArtifactsType()
        {
            // Valid target collection and dictionary; only the source input is wrong.
            TLArtifactsCollection targets = new TLArtifactsCollection();
            TLDictionaryIndex index = new TLDictionaryIndex();

            targets.Add(new TLArtifact("id1", "text2"));
            index.AddTermEntry("term", 1, 1, 1);

            // Deliberately the wrong type for the source artifacts slot.
            Workspace.Store("sourceArtifacts", "incorrect type");
            Workspace.Store("targetArtifacts", targets);
            Workspace.Store("dictionaryIndex", index);

            TracerConfig config = (TracerConfig)TestComponent.Configuration;
            config.SimilarityMetric = SimilarityMetricMethod.SimpleMatching;

            TestComponent.Compute();
        }
Beispiel #4
0
        /// <summary>
        /// Builds a dictionary index from a collection of already-processed artifacts.
        /// </summary>
        /// <remarks>
        /// Each artifact's text is split on single spaces (empty tokens are dropped).
        /// Every distinct term gets a term entry, and every (term, artifact) pair gets
        /// a posting whose frequency and weight both hold the raw occurrence count.
        /// After counting, each artifact that has at least one posting gets a document
        /// vector weight equal to the square root of the sum of its squared posting
        /// frequencies.
        /// </remarks>
        /// <param name="setOfProcessedDocuments">
        /// Collection whose values are the artifacts (id plus processed text) to index.
        /// </param>
        /// <param name="logger">
        /// Receives a warning for every artifact whose text yields no terms. May be
        /// null, in which case the warning is skipped.
        /// </param>
        /// <returns>The populated dictionary index.</returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="setOfProcessedDocuments"/> is null.
        /// </exception>
        public static TLDictionaryIndex build(TLArtifactsCollection setOfProcessedDocuments, ComponentLogger logger)
        {
            if (setOfProcessedDocuments == null)
            {
                throw new ArgumentNullException("setOfProcessedDocuments");
            }

            // Sum of squared term frequencies per artifact id; the square root is
            // taken in the last pass to obtain the document vector length.
            Dictionary<string, double> sumOfSquaresByArtifact = new Dictionary<string, double>();

            TLDictionaryIndex dict = new TLDictionaryIndex();

            // Pass 1: count term occurrences per artifact.
            foreach (TLArtifact artifact in setOfProcessedDocuments.Values)
            {
                string[] terms = artifact.Text.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

                // The warning is advisory only, so a missing logger must not abort indexing.
                if (terms.Length == 0 && logger != null)
                {
                    logger.Warn(String.Format("Artifact Id {0} is empty.", artifact.Id));
                }

                foreach (string term in terms)
                {
                    if (!dict.ContainsTermEntry(term))
                    {
                        // First occurrence of the term anywhere: new entry plus its first posting.
                        TLTermEntry newEntry = dict.AddTermEntry(term, 1, 1, 1.0);
                        newEntry.AddPosting(artifact.Id, 1, 1.0);
                        continue;
                    }

                    TLTermEntry entry = dict.GetTermEntry(term);
                    entry.TotalFrequencyAcrossArtifacts += 1;
                    entry.Weight = 1.0;

                    if (entry.ContainsPosting(artifact.Id))
                    {
                        // Term seen before in this artifact: bump the existing posting.
                        TLPosting posting = entry.GetPosting(artifact.Id);
                        posting.Frequency += 1;
                        posting.Weight    += 1.0;
                    }
                    else
                    {
                        // Term known, but new to this artifact.
                        entry.NumberOfArtifactsContainingTerm += 1;
                        entry.AddPosting(artifact.Id, 1, 1.0);
                    }
                }
            }

            // Pass 2: with all counts final, accumulate squared frequencies per artifact.
            foreach (TLTermEntry entry in dict.TermEntries)
            {
                foreach (TLPosting posting in entry.Postings)
                {
                    // TryGetValue leaves the accumulator at 0.0 when the artifact has no sum yet.
                    double accumulated;
                    sumOfSquaresByArtifact.TryGetValue(posting.ArtifactId, out accumulated);
                    sumOfSquaresByArtifact[posting.ArtifactId] = accumulated + Math.Pow(posting.Frequency, 2);
                }
            }

            // Pass 3: store the vector length (square root of the sum) on the dictionary.
            // Artifacts without any term have no entry and are skipped.
            foreach (TLArtifact artifact in setOfProcessedDocuments.Values)
            {
                double sumOfSquares;
                if (sumOfSquaresByArtifact.TryGetValue(artifact.Id, out sumOfSquares))
                {
                    dict.SetDocumentVectorWeight(artifact.Id, Math.Sqrt(sumOfSquares));
                }
            }

            return dict;
        }