Example #1
        public void EmptyDictionaryIndexTest()
        {
            TLArtifactsCollection sourceArtifacts = new TLArtifactsCollection();

            sourceArtifacts.Add(new TLArtifact("id", "text"));
            TLArtifactsCollection targetArtifacts = new TLArtifactsCollection();

            targetArtifacts.Add(new TLArtifact("id", "text"));
            TLDictionaryIndex dictionary = new TLDictionaryIndex();

            Workspace.Store("sourceArtifacts", sourceArtifacts);
            Workspace.Store("targetArtifacts", targetArtifacts);
            Workspace.Store("dictionaryIndex", dictionary);

            ((TracerConfig)TestComponent.Configuration).SimilarityMetric = SimilarityMetricMethod.SimpleMatching;

            TestComponent.Compute();

            TLSimilarityMatrix simMat = (TLSimilarityMatrix)Workspace.Load("similarityMatrix");

            if (simMat == null || simMat.Count != 0)
            {
                Assert.Fail("Similarity Matrix should still be created but have nothing in it");
            }
        }
Example #2
        public void TestTracingOfComponent()
        {
            TLArtifactsCollection sourceArtifacts = new TLArtifactsCollection();
            TLArtifactsCollection targetArtifacts = new TLArtifactsCollection();
            TLDictionaryIndex     dictionary      = new TLDictionaryIndex();

            // TODO: add inputs that matter
            sourceArtifacts.Add(new TLArtifact("id1", "first text"));
            sourceArtifacts.Add(new TLArtifact("id2", "words to do stuff with"));
            sourceArtifacts.Add(new TLArtifact("id3", "some more text"));

            targetArtifacts.Add(new TLArtifact("id1", "hello world"));
            targetArtifacts.Add(new TLArtifact("id2", "very very random yes indeed"));
            targetArtifacts.Add(new TLArtifact("id3", "yep"));
            targetArtifacts.Add(new TLArtifact("id4", "chickens in the coop"));

            dictionary.AddTermEntry("term", 3, 3, 0.2);

            Workspace.Store("sourceArtifacts", sourceArtifacts);
            Workspace.Store("targetArtifacts", targetArtifacts);
            Workspace.Store("dictionaryIndex", dictionary);

            ((TracerConfig)TestComponent.Configuration).SimilarityMetric = SimilarityMetricMethod.SimpleMatching;

            TestComponent.Compute();

            TLSimilarityMatrix simMat = (TLSimilarityMatrix)Workspace.Load("similarityMatrix");

            // TODO: add tests to make sure the output is correctly formatted (see the sketch after this method)
            Assert.Fail();
        }
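A minimal sketch of the assertions the TODO above calls for, using only calls and members already present in these examples (Assert.IsNotNull/IsTrue from the same test framework, simMat.Count as in Example #1); the bound of 12 links is just 3 sources x 4 targets, an assumption rather than a documented contract:

        // Hypothetical replacement for the Assert.Fail() placeholder above:
        Assert.IsNotNull(simMat, "Compute should always store a similarity matrix");
        Assert.IsTrue(simMat.Count <= 12, "At most one link per (source, target) pair: 3 sources x 4 targets");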
Example #3
        public override void Compute()
        {
            TLArtifactsCollection listOfArtifacts =
                (TLArtifactsCollection)Workspace.Load("listOfArtifacts");

            TLDictionaryIndex dict = BuildDictionary(listOfArtifacts, Logger);

            Workspace.Store("dictionaryIndex", dict);
        }
Example #4
        public override void Compute()
        {
            TLArtifactsCollection sourceArtifacts = (TLArtifactsCollection)Workspace.Load("sourceArtifacts");
            TLDictionaryIndex     dict            = (TLDictionaryIndex)Workspace.Load("dictionaryIndex");
            TracerConfig          config          = (TracerConfig)this.Configuration;

            TLSimilarityMatrix similarityMatrix = Process(sourceArtifacts, dict, config);

            Workspace.Store("similarityMatrix", similarityMatrix);
        }
Example #5
        private static TLDictionaryIndex BuildDictionary(TLArtifactsCollection listOfArtifacts, ComponentLogger logger)
        {
            if (listOfArtifacts == null)
            {
                throw new ComponentException("Received null listOfArtifacts");
            }

            TLDictionaryIndex dict = TFIDFIndexBuilder.build(listOfArtifacts, logger);

            return(dict);
        }
Example #6
        public void NullSourceArtifactsTest()
        {
            TLArtifactsCollection targetArtifacts = new TLArtifactsCollection();
            TLDictionaryIndex     dictionary      = new TLDictionaryIndex();

            Workspace.Store("sourceArtifacts", null);
            Workspace.Store("targetArtifacts", targetArtifacts);
            Workspace.Store("dictionaryIndex", dictionary);

            ((TracerConfig)TestComponent.Configuration).SimilarityMetric = SimilarityMetricMethod.SimpleMatching;

            // Compute is expected to throw here: the tracer rejects null
            // source artifacts with a ComponentException (see Example #10).
            TestComponent.Compute();
        }
Example #7
        /// <summary>
        /// Computes the traceability between source and target artifacts using the dictionary index and American National Corpus (ANC) term weights.
        /// </summary>
        /// <param name="sourceArtifacts">The source artifacts.</param>
        /// <param name="targetArtifacts">The target artifacts.</param>
        /// <param name="dict">The dictionary index.</param>
        /// <param name="ancTermsWeights">The ANC term weights.</param>
        /// <param name="config">The tracer configuration.</param>
        /// <returns>Similarity matrix with links between source and target artifacts.</returns>
        private static TLSimilarityMatrix ComputeTraceability(TLArtifactsCollection sourceArtifacts,
                                                              TLArtifactsCollection targetArtifacts,
                                                              TLDictionaryIndex dict,
                                                              TLKeyValuePairsList ancTermsWeights,
                                                              TracerConfig config)
        {
            if (sourceArtifacts == null)
            {
                throw new ComponentException("Received source artifacts are null!");
            }

            if (targetArtifacts == null)
            {
                throw new ComponentException("Received target artifacts are null!");
            }

            if (dict == null)
            {
                throw new ComponentException("Received dictionary index is null!");
            }

            if (ancTermsWeights == null)
            {
                throw new ComponentException("Received 'ancTermsWeights' is null!");
            }

            TLSimilarityMatrix similarityMatrix = new TLSimilarityMatrix();

            ANCSearcher searcher = new ANCSearcher(SimilarityMetricFactory.GetSimiliarityMetric(config.SimilarityMetric));

            // Iterates over all the source artifacts, scoring each one against the target artifacts by executing a search
            foreach (TLArtifact sourceArtifact in sourceArtifacts.Values)
            {
                String query = sourceArtifact.Text;

                // Executes the query
                List <Result> results;
                results = searcher.search(query, dict, PrepareANCData(ancTermsWeights));

                // Iterates over the results and stores them in the matrix
                foreach (Result r in results)
                {
                    string targetArtifactId = r.ArtifactId;
                    similarityMatrix.AddLink(sourceArtifact.Id, targetArtifactId, r.Ranking);
                }
            }
            return(similarityMatrix);
        }
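For orientation, a hedged illustration of what the search loop above builds; AddLink and Count are used exactly as in the surrounding examples, but the ids and scores here are invented:

        // Hypothetical contents of a similarity matrix after tracing:
        TLSimilarityMatrix example = new TLSimilarityMatrix();
        example.AddLink("src1", "tgt3", 0.85);   // strong match returned by the searcher
        example.AddLink("src2", "tgt3", 0.10);   // weaker match for a second source artifact
        // example.Count == 2: one entry per traced (source, target) link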
Example #8
        public override void Compute()
        {
            Logger.Trace("Start component ANCTracerComponent");

            TLArtifactsCollection sourceArtifacts = (TLArtifactsCollection)Workspace.Load("sourceArtifacts");
            TLArtifactsCollection targetArtifacts = (TLArtifactsCollection)Workspace.Load("targetArtifacts");
            TLDictionaryIndex     dict            = (TLDictionaryIndex)Workspace.Load("dictionaryIndex");
            TLKeyValuePairsList   ancTermsWeights = (TLKeyValuePairsList)Workspace.Load("ancTermsWeights");

            TracerConfig config = (TracerConfig)this.Configuration;

            TLSimilarityMatrix similarityMatrix = ComputeTraceability(sourceArtifacts, targetArtifacts, dict, ancTermsWeights, config);

            Workspace.Store("similarityMatrix", similarityMatrix);

            Logger.Trace("Completed component ANCTracerComponent");
        }
Example #9
        public void IncorrectInputSourceArtifactsType()
        {
            TLArtifactsCollection targetArtifacts = new TLArtifactsCollection();

            targetArtifacts.Add(new TLArtifact("id1", "text2"));
            TLDictionaryIndex dictionary = new TLDictionaryIndex();

            dictionary.AddTermEntry("term", 1, 1, 1);

            Workspace.Store("sourceArtifacts", "incorrect type");
            Workspace.Store("targetArtifacts", targetArtifacts);
            Workspace.Store("dictionaryIndex", dictionary);

            ((TracerConfig)TestComponent.Configuration).SimilarityMetric = SimilarityMetricMethod.SimpleMatching;

            // Compute is expected to fail here: the stored string cannot be cast
            // to TLArtifactsCollection when the component loads its input (see Example #4).
            TestComponent.Compute();
        }
Example #10
        private static TLSimilarityMatrix Process(TLArtifactsCollection sourceArtifacts, TLDictionaryIndex dict, TracerConfig config)
        {
            if (sourceArtifacts == null)
            {
                throw new ComponentException("Received null sourceArtifacts");
            }

            if (dict == null)
            {
                throw new ComponentException("Received null dictionaryIndex");
            }

            TLSimilarityMatrix similarityMatrix = new TLSimilarityMatrix();

            Searcher searcher = new Searcher(SimilarityMetricFactory.GetSimiliarityMetric(config.SimilarityMetric));

            // Iterates over all the source artifacts, scoring each one against the target artifacts by executing a search
            foreach (TLArtifact sourceArtifact in sourceArtifacts.Values)
            {
                String query = sourceArtifact.Text;

                // Executes the query
                List <Result> results;
                results = searcher.search(query, dict);

                // Iterates over the results and stores them in the matrix
                foreach (Result r in results)
                {
                    string targetArtifactId = r.ArtifactId;
                    similarityMatrix.AddLink(sourceArtifact.Id, targetArtifactId, r.Ranking);
                }
            }

            return(similarityMatrix);
        }
Example #11
        /// <summary>
        /// Input is a map of artifact.Id to the processed text of each artifact.
        /// </summary>
        public static TLDictionaryIndex build(TLArtifactsCollection setOfProcessedDocuments, ComponentLogger logger)
        {
            // Variables
            TLTermEntry termEntry;
            TLPosting   posting;

            double vectorLength;
            // Stores the vector length of each document - this is used to normalize the term weights.
            // The vector length is the square root of the sum of the squares of all the term weights.
            Dictionary <string, double> documentVectorLength = new Dictionary <string, double>();

            // Creates the dictionary
            TLDictionaryIndex dict = new TLDictionaryIndex();

            // Iterates over all the documents
            foreach (TLArtifact artifact in setOfProcessedDocuments.Values)
            {
                string[] terms = artifact.Text.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

                if (terms.Length == 0)
                {
                    logger.Warn(String.Format("Artifact Id {0} is empty.", artifact.Id));
                }

                // Iterates over all the terms
                foreach (string t in terms)
                {
                    // Checks if that term has already a posting
                    if (!dict.ContainsTermEntry(t))
                    {
                        // New term
                        termEntry = dict.AddTermEntry(t, 1, 1, 1.0);
                        posting   = termEntry.AddPosting(artifact.Id, 1, 1.0);
                    }
                    else
                    {
                        // Existing term
                        termEntry = dict.GetTermEntry(t);
                        termEntry.TotalFrequencyAcrossArtifacts += 1;
                        termEntry.Weight = 1.0;

                        // Checks if there is already a posting for this document
                        if (!termEntry.ContainsPosting(artifact.Id))
                        {
                            // New posting
                            termEntry.NumberOfArtifactsContainingTerm += 1;
                            posting = termEntry.AddPosting(artifact.Id, 1, 1.0);
                        }
                        else
                        {
                            // Existing posting
                            posting            = termEntry.GetPosting(artifact.Id);
                            posting.Frequency += 1;
                            posting.Weight    += 1.0;
                        }
                    }
                }
            }

            string artId;

            // Now that all the counts are in, it calculates the document vector weights
            foreach (TLTermEntry t in dict.TermEntries)
            {
                foreach (TLPosting p in t.Postings)
                {
                    artId        = p.ArtifactId;
                    vectorLength = Math.Pow(p.Frequency, 2);
                    if (documentVectorLength.ContainsKey(artId))
                    {
                        // The document has other terms
                        vectorLength += documentVectorLength[artId];
                    }
                    documentVectorLength[artId] = vectorLength;
                }
            }

            // Finally, we need to get the square root of all entries in the document vector length
            foreach (TLArtifact artifact in setOfProcessedDocuments.Values)
            {
                if (documentVectorLength.ContainsKey(artifact.Id))
                {
                    vectorLength = Math.Sqrt(documentVectorLength[artifact.Id]);
                    // Here we update the document vector length of the dictionary - not the internal structure any more
                    dict.SetDocumentVectorWeight(artifact.Id, vectorLength);
                }
            }

            return(dict);
        }
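The bookkeeping above computes each document's vector length as the square root of the sum of the squared posting frequencies (in this basic builder a posting's weight simply equals its frequency). A quick numeric spot-check of that formula:

        // A document whose terms occur 2, 1 and 2 times has
        // vector length sqrt(2^2 + 1^2 + 2^2) = sqrt(9) = 3.
        double exampleLength = Math.Sqrt(Math.Pow(2, 2) + Math.Pow(1, 2) + Math.Pow(2, 2));   // == 3.0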
Example #12
        /// <summary>
        /// Input is a map of artifact.Id to the processed text of each artifact.
        /// </summary>
        public static new TLDictionaryIndex build(TLArtifactsCollection setOfProcessedDocuments, ComponentLogger logger)
        {
            // Variables
            int    N = setOfProcessedDocuments.Count;          // Total Number of Documents
            double idf;
            String docId;
            double vectorLength;
            // Stores the vector length of each document - this is used to normalize the term weights.
            // The vector length is the square root of the sum of the squares of all the term weights.
            Dictionary <string, double> documentVectorLength = new Dictionary <string, double>();

            // It starts off by calling the parent method, which will calculate the basic frequencies
            TLDictionaryIndex dict = BasicIndexBuilder.build(setOfProcessedDocuments, logger);

            // Iterates over all the terms
            foreach (TLTermEntry term in dict.TermEntries)
            {
                // Calculates the idf for each term - and stores this in the weight of the term - for weighing queries later
                idf         = Math.Log10(N / ((double)term.NumberOfArtifactsContainingTerm));
                term.Weight = idf;

                // Iterates over all the postings
                foreach (TLPosting posting in term.Postings)
                {
                    // Multiplies each term weight by the idf
                    double newWeight = posting.Frequency * idf;
                    posting.Weight = newWeight;

                    // Updates the document vector length
                    docId = posting.ArtifactId;

                    vectorLength = Math.Pow(newWeight, 2);
                    if (documentVectorLength.ContainsKey(docId))
                    {
                        // The document has other terms
                        vectorLength += documentVectorLength[docId];
                    }
                    documentVectorLength[docId] = vectorLength;
                }
            }

            // Now, we need to get the square root of all entries in the document vector length
            foreach (TLArtifact d in setOfProcessedDocuments.Values)
            {
                docId = d.Id;
                if (documentVectorLength.ContainsKey(docId))
                {
                    vectorLength = Math.Sqrt(documentVectorLength[docId]);
                    documentVectorLength[docId] = vectorLength;
                    // Here we update the document vector length of the dictionary - not the internal structure anymore
                    dict.SetDocumentVectorWeight(docId, vectorLength);
                }
            }

            // Lastly, we normalize all the term weights
            foreach (TLTermEntry term in dict.TermEntries)
            {
                foreach (TLPosting posting in term.Postings)
                {
                    vectorLength   = documentVectorLength[posting.ArtifactId];
                    posting.Weight = (posting.Weight / vectorLength);
                }
            }

            return(dict);
        }
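A numeric spot-check of the tf-idf weighting implemented above, mirroring the formulas in build(); the counts are made up:

        // With N = 4 documents and a term that appears in df = 2 of them:
        int N = 4, df = 2, tf = 3;
        double idf       = Math.Log10(N / (double)df);   // log10(2) ≈ 0.30103
        double rawWeight = tf * idf;                     // ≈ 0.90309
        // rawWeight is the posting weight before the final pass divides it
        // by the document's vector length.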
Example #13
        public List <Result> search(String query, TLDictionaryIndex dict)
        {
            // Variables
            List <Result> results;
            Dictionary <string, double> intermediateResults;
            Dictionary <string, int>    queryTermFrequency;
            Dictionary <string, double> queryTermWeight;
            double queryVectorLength;

            // Initializes the data structures
            results             = new List <Result>();                  // Result array
            intermediateResults = new Dictionary <string, double>();    // Where the intermediate results of the query are kept.
            queryTermFrequency  = new Dictionary <string, int>();       // Keeps track of term frequencies
            queryTermWeight     = new Dictionary <string, double>();    // Keeps track of term weights
            queryVectorLength   = 0.0;                                  // The document vector length of the query

            // The query is broken down into tokens
            string[] queryTerms = query.Split(new char[] { ' ' });

            // Iterates over each query term to compute the term frequency
            foreach (string qterm in queryTerms)
            {
                // It only cares about those words that are in the dictionary
                if (dict.ContainsTermEntry(qterm))
                {
                    if (!queryTermFrequency.ContainsKey(qterm))
                    {
                        // First time the query word is encountered
                        queryTermFrequency.Add(qterm, 1);
                    }
                    else
                    {
                        // The query word is already there, so the frequency gets increased
                        queryTermFrequency[qterm] += 1;
                    }
                }
            }

            // Iterates over the resulting query terms to compute their weights and the dot product of the query terms x the documents terms
            foreach (string qterm in queryTermFrequency.Keys)
            {
                // Gets the Term from the dictionary
                TLTermEntry term = dict.GetTermEntry(qterm);

                // It computes the weight of a term - i.e. the frequency TIMES the term's specificity.
                // Note: the specificity of the term is stored in the weight.
                //      For the basic dictionary this is just 1
                //      For the tf-idf dictionary this is the idf
                //      For the signal-noise this is the signal
                double weight = queryTermFrequency[qterm] * term.Weight;
                queryTermWeight.Add(qterm, weight);

                // Updates the document vector length of the query
                queryVectorLength += Math.Pow(weight, 2);

                // It now iterates over all the postings that have this term
                foreach (TLPosting posting in term.Postings)
                {
                    string docId = posting.ArtifactId;

                    // Calculates the product of the query times the posting for this particular term
                    double r = queryTermWeight[qterm] * posting.Weight;
                    if (intermediateResults.ContainsKey(docId))
                    {
                        intermediateResults[docId] += r;
                    }
                    else
                    {
                        intermediateResults.Add(docId, r);
                    }
                }
            }

            // The document vector length for the query is the square root of the sum of the squares of the term weights
            queryVectorLength = Math.Sqrt(queryVectorLength);

            // It iterates over the intermediate results to create the final array that is returned to the user
            foreach (string docId in intermediateResults.Keys)
            {
                double similarity = _similarityMetric.ComputeSimilarity(intermediateResults[docId], queryVectorLength, dict.GetDocumentVectorWeight(docId));
                Result r          = new Result(docId, similarity);
                results.Add(r);
            }

            // Sorts the results
            results.Sort();
            return(results);
        }
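By the end of search(), intermediateResults[docId] holds the dot product of the query and document term weights, and queryVectorLength holds the query's vector length. What ComputeSimilarity does with those values depends on the configured metric; assuming a cosine-style metric, it would reduce to the familiar normalized dot product:

        // Hedged sketch: for a cosine-like metric,
        //   similarity = dotProduct / (queryVectorLength * documentVectorWeight)
        // e.g. dot = 0.6, |q| = 1.0, |d| = 0.8  ->  0.6 / (1.0 * 0.8) = 0.75
        double exampleSimilarity = 0.6 / (1.0 * 0.8);   // == 0.75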