/// <summary>
        /// Computes the traceability between source and target artifacts using dictionary and American Corpus Term weights.
        /// </summary>
        /// <param name="sourceArtifacts">The source artifacts to trace from.</param>
        /// <param name="targetArtifacts">The target artifacts to trace to (validated only; the search runs against the dictionary index).</param>
        /// <param name="dict">The dictionary index searched for each source artifact's text.</param>
        /// <param name="ancTermsWeights">American National Corpus term weights, converted once via PrepareANCData before searching.</param>
        /// <param name="config">The tracer configuration supplying the similarity metric.</param>
        /// <returns>Similarity matrix with links between source and target artifacts</returns>
        /// <exception cref="ComponentException">Thrown when any of the required inputs is null.</exception>
        private static TLSimilarityMatrix ComputeTraceability(TLArtifactsCollection sourceArtifacts, 
                                                              TLArtifactsCollection targetArtifacts, 
                                                              TLDictionaryIndex dict, 
                                                              TLKeyValuePairsList ancTermsWeights, 
                                                              TracerConfig config)
        {
            if (sourceArtifacts == null)
            {
                throw new ComponentException("Received source artifacts are null!");
            }

            if (targetArtifacts == null)
            {
                throw new ComponentException("Received target artifacts are null!");
            }

            if (dict == null)
            {
                throw new ComponentException("Received dictionary index is null!");
            }

            if (ancTermsWeights == null)
            {
                throw new ComponentException("Received 'ancTermsWeights' is null!");
            }

            // Guard config as well - it is dereferenced below, and a ComponentException is more
            // informative than the NullReferenceException the original code would have thrown.
            if (config == null)
            {
                throw new ComponentException("Received config is null!");
            }

            TLSimilarityMatrix similarityMatrix = new TLSimilarityMatrix();

            ANCSearcher searcher = new ANCSearcher(SimilarityMetricFactory.GetSimiliarityMetric(config.SimilarityMetric));

            // Hoisted out of the loop: the ANC weights do not change per source artifact, so the
            // original code rebuilt the same lookup on every iteration for no benefit.
            Dictionary<string, double> preparedAncWeights = PrepareANCData(ancTermsWeights);

            // Iterates over all the source artifacts to determine the probabilities to target artifacts - by executing a search
            foreach (TLArtifact sourceArtifact in sourceArtifacts.Values)
            {
                String query = sourceArtifact.Text;

                // Executes the query
                List<Result> results = searcher.search(query, dict, preparedAncWeights);

                // Iterates over the results and stores them in the matrix
                foreach (Result r in results)
                {
                    similarityMatrix.AddLink(sourceArtifact.Id, r.ArtifactId, r.Ranking);
                }
            }
            return similarityMatrix;
        }
        /// <summary>
        /// Verifies that tracing with one source artifact, a non-empty dictionary, and an
        /// empty target collection still produces a similarity matrix - just an empty one.
        /// </summary>
        public void EmptyTargetArtifactsTest()
        {
            TLDictionaryIndex dictionaryIndex = new TLDictionaryIndex();
            dictionaryIndex.AddTermEntry("term", 1, 1, 1);

            TLArtifactsCollection sources = new TLArtifactsCollection();
            sources.Add(new TLArtifact("id", "text"));

            // Deliberately left empty - this is the condition under test.
            TLArtifactsCollection targets = new TLArtifactsCollection();

            Workspace.Store("sourceArtifacts", sources);
            Workspace.Store("targetArtifacts", targets);
            Workspace.Store("dictionaryIndex", dictionaryIndex);

            ((TracerConfig)TestComponent.Configuration).SimilarityMetric = SimilarityMetricMethod.SimpleMatching;

            TestComponent.Compute();

            // The component must have created the matrix, and it must contain no links.
            TLSimilarityMatrix matrix = (TLSimilarityMatrix)Workspace.Load("similarityMatrix");
            if (matrix == null || matrix.Count != 0)
            {
                Assert.Fail("Similarity Matrix should still be created but have nothing in it");
            }
        }
        /// <summary>
        /// Computes the traceability links from each source artifact by executing a search of
        /// its text against the dictionary index.
        /// </summary>
        /// <param name="sourceArtifacts">The source artifacts to trace from.</param>
        /// <param name="dict">The dictionary index searched for each source artifact's text.</param>
        /// <param name="config">The tracer configuration supplying the similarity metric.</param>
        /// <returns>Similarity matrix with links between source artifacts and search results.</returns>
        /// <exception cref="ComponentException">Thrown when a required input is null.</exception>
        private static TLSimilarityMatrix Process(TLArtifactsCollection sourceArtifacts, TLDictionaryIndex dict, TracerConfig config)
        {
            if (sourceArtifacts == null)
            {
                throw new ComponentException("Received null sourceArtifacts");
            }

            if (dict == null)
            {
                throw new ComponentException("Received null dictionaryIndex");
            }

            // Guard config as well - it is dereferenced below; the original code would have
            // thrown a bare NullReferenceException instead of a descriptive ComponentException.
            if (config == null)
            {
                throw new ComponentException("Received null config");
            }

            TLSimilarityMatrix similarityMatrix = new TLSimilarityMatrix();

            Searcher searcher = new Searcher(SimilarityMetricFactory.GetSimiliarityMetric(config.SimilarityMetric));

            // Iterates over all the source artifacts to determine the probabilities to target artifacts - by executing a search
            foreach (TLArtifact sourceArtifact in sourceArtifacts.Values)
            {
                String query = sourceArtifact.Text;

                // Executes the query
                List<Result> results = searcher.search(query, dict);

                // Iterates over the results and stores them in the matrix
                foreach (Result r in results)
                {
                    similarityMatrix.AddLink(sourceArtifact.Id, r.ArtifactId, r.Ranking);
                }
            }

            return similarityMatrix;
        }
        /// <summary>
        /// Stores a plain string under "sourceArtifacts" instead of a TLArtifactsCollection and
        /// runs the component. NOTE(review): presumably the component is expected to reject the
        /// wrong type - the expected-exception attribute, if any, is outside this view.
        /// </summary>
        public void IncorrectInputSourceArtifactsType()
        {
            TLDictionaryIndex dictionaryIndex = new TLDictionaryIndex();
            dictionaryIndex.AddTermEntry("term", 1, 1, 1);

            TLArtifactsCollection targets = new TLArtifactsCollection();
            targets.Add(new TLArtifact("id1", "text2"));

            // The wrong type is the condition under test.
            Workspace.Store("sourceArtifacts", "incorrect type");
            Workspace.Store("targetArtifacts", targets);
            Workspace.Store("dictionaryIndex", dictionaryIndex);

            ((TracerConfig)TestComponent.Configuration).SimilarityMetric = SimilarityMetricMethod.SimpleMatching;

            TestComponent.Compute();
        }
        /// <summary>
        /// Stores null under "sourceArtifacts" and runs the component. NOTE(review): presumably
        /// the component is expected to throw on the null input - the expected-exception
        /// attribute, if any, is outside this view.
        /// </summary>
        public void NullSourceArtifactsTest()
        {
            TLDictionaryIndex dictionaryIndex = new TLDictionaryIndex();
            TLArtifactsCollection targets = new TLArtifactsCollection();

            // The null source collection is the condition under test.
            Workspace.Store("sourceArtifacts", null);
            Workspace.Store("targetArtifacts", targets);
            Workspace.Store("dictionaryIndex", dictionaryIndex);

            ((TracerConfig)TestComponent.Configuration).SimilarityMetric = SimilarityMetricMethod.SimpleMatching;

            TestComponent.Compute();
        }
        /// <summary>
        /// Runs the tracer end-to-end on a small artifact set. NOTE(review): this test is
        /// unfinished by design - it ends in an unconditional Assert.Fail() until the TODOs
        /// (meaningful inputs and output assertions) are addressed.
        /// </summary>
        public void TestTracingOfComponent()
        {
            TLArtifactsCollection sources = new TLArtifactsCollection();
            TLArtifactsCollection targets = new TLArtifactsCollection();
            TLDictionaryIndex dictionaryIndex = new TLDictionaryIndex();

            // TODO: add inputs that matter
            sources.Add(new TLArtifact("id1", "first text"));
            sources.Add(new TLArtifact("id2", "words to do stuff with"));
            sources.Add(new TLArtifact("id3", "some more text"));

            targets.Add(new TLArtifact("id1", "hello world"));
            targets.Add(new TLArtifact("id2", "very very random yes indeed"));
            targets.Add(new TLArtifact("id3", "yep"));
            targets.Add(new TLArtifact("id4", "chickens in the coop"));

            dictionaryIndex.AddTermEntry("term", 3, 3, 0.2);

            Workspace.Store("sourceArtifacts", sources);
            Workspace.Store("targetArtifacts", targets);
            Workspace.Store("dictionaryIndex", dictionaryIndex);

            ((TracerConfig)TestComponent.Configuration).SimilarityMetric = SimilarityMetricMethod.SimpleMatching;

            TestComponent.Compute();

            TLSimilarityMatrix result = (TLSimilarityMatrix)Workspace.Load("similarityMatrix");
            // TODO: add tests to make sure the output is correctly formatted
            Assert.Fail();
        }
// Example #7
        /// <summary>
        /// Executes the query against the dictionary index, weighting each query term with its
        /// American National Corpus (ANC) weight instead of the dictionary term weight.
        /// </summary>
        /// <param name="query">Space-separated, preprocessed query text.</param>
        /// <param name="dict">Dictionary index providing term entries, postings, and document vector weights.</param>
        /// <param name="ancTermsWeights">Map of term to ANC weight; terms missing from the map default to 1.0.</param>
        /// <returns>Search results sorted via Result's natural ordering.</returns>
        public List<Result> search(String query, TLDictionaryIndex dict, Dictionary<string, double> ancTermsWeights)
        {
            List<Result> results = new List<Result>();
            // Intermediate dot products of the query vector with each document vector, keyed by artifact id.
            Dictionary<string, double> intermediateResults = new Dictionary<string, double>();
            // Frequency of each query term that appears in the dictionary.
            Dictionary<string, int> queryTermFrequency = new Dictionary<string, int>();
            // Weight of each query term (frequency times ANC weight).
            Dictionary<string, double> queryTermWeight = new Dictionary<string, double>();
            // Accumulates squared term weights; square-rooted after the loop.
            double queryVectorLength = 0.0;

            // The query is broken down into tokens.
            string[] queryTerms = query.Split(new char[] { ' ' });

            // Computes the term frequency of every query term found in the dictionary.
            // TryGetValue replaces the original ContainsKey-then-Add/increment double lookup.
            foreach (string qterm in queryTerms)
            {
                if (dict.ContainsTermEntry(qterm))
                {
                    int frequency;
                    queryTermFrequency.TryGetValue(qterm, out frequency);
                    queryTermFrequency[qterm] = frequency + 1;
                }
            }

            // Computes each term's weight and the dot product of the query terms x the document terms.
            foreach (string qterm in queryTermFrequency.Keys)
            {
                // Gets the Term from the dictionary.
                TLTermEntry term = dict.GetTermEntry(qterm);

                // The weight of a term is its query frequency TIMES its specificity; here the
                // specificity comes from the ANC weights, defaulting to a neutral 1.0 for terms
                // absent from the ANC map. (The dictionary-based variant would instead use
                // term.Weight: 1 for basic, idf for tf-idf, signal for signal-noise.)
                double ancWeight;
                if (!ancTermsWeights.TryGetValue(qterm, out ancWeight))
                {
                    ancWeight = 1.0;
                }

                double weight = queryTermFrequency[qterm] * ancWeight;
                queryTermWeight.Add(qterm, weight);

                // Updates the document vector length of the query.
                queryVectorLength += Math.Pow(weight, 2);

                // Iterates over all the postings that have this term, accumulating the product of
                // the query weight and the posting weight per document.
                foreach (TLPosting posting in term.Postings)
                {
                    string docId = posting.ArtifactId;

                    // Uses the local 'weight' (same value just stored in queryTermWeight) and
                    // TryGetValue instead of the original ContainsKey + indexer double lookup.
                    double accumulated;
                    intermediateResults.TryGetValue(docId, out accumulated);
                    intermediateResults[docId] = accumulated + weight * posting.Weight;
                }
            }

            // The document vector length for the query is the square root of the sum of the squares of the term weights.
            queryVectorLength = Math.Sqrt(queryVectorLength);

            // Converts the intermediate dot products into similarity scores for the caller.
            foreach (string docId in intermediateResults.Keys)
            {
                double similarity = _similarityMetric.ComputeSimilarity(intermediateResults[docId], queryVectorLength, dict.GetDocumentVectorWeight(docId));
                results.Add(new Result(docId, similarity));
            }

            // Sorts the results.
            results.Sort();
            return results;
        }
		/// <summary>
		/// Builds a basic dictionary index (all term specificities are 1.0) from the artifacts.
		/// Input is a map of artifact Id and processed text of each artifact.
		/// </summary>
		/// <param name="setOfProcessedDocuments">Artifacts whose Text is a space-separated list of terms.</param>
		/// <param name="logger">Logger used to warn about artifacts with empty text.</param>
		/// <returns>The populated dictionary index, including per-document vector weights.</returns>
		public static TLDictionaryIndex build(TLArtifactsCollection setOfProcessedDocuments, ComponentLogger logger)
		{
			// Variables
			TLTermEntry termEntry;
			TLPosting posting;

			double vectorLength;
			// Stores the vector length of each document - this is used to normalize the term weights.
			// The vector length is the square root of the sum of the squares of all the term weights.
			Dictionary<string, double> documentVectorLength = new Dictionary<string, double>();

			// Creates the dictionary
			TLDictionaryIndex dict = new TLDictionaryIndex();

			// First pass: iterates over all the documents counting term and posting frequencies.
			foreach (TLArtifact artifact in setOfProcessedDocuments.Values)
			{
				string[] terms = artifact.Text.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

				if (terms.Length == 0)
				{
					logger.Warn(String.Format("Artifact Id {0} is empty.", artifact.Id));
				}

				// Iterates over all the terms
				foreach (string t in terms)
				{
					// Checks if that term has already a posting
					if (!dict.ContainsTermEntry(t))
					{
						// First occurrence of the term anywhere: create the entry and its posting.
						termEntry = dict.AddTermEntry(t, 1, 1, 1.0);
						posting = termEntry.AddPosting(artifact.Id, 1, 1.0);
					}
					else
					{
						// Existing term
						termEntry = dict.GetTermEntry(t);
						termEntry.TotalFrequencyAcrossArtifacts += 1;
						termEntry.Weight = 1.0; // basic dictionary: specificity is always 1

						// Checks if there is already a posting for this document
						if (!termEntry.ContainsPosting(artifact.Id))
						{
							// First occurrence of the term in this document.
							termEntry.NumberOfArtifactsContainingTerm += 1;
							posting = termEntry.AddPosting(artifact.Id, 1, 1.0);
						}
						else
						{
							// Repeat occurrence of the term in the same document.
							posting = termEntry.GetPosting(artifact.Id);
							posting.Frequency += 1;
							posting.Weight += 1.0;
						}
					}
				}
			}

			string artId;
			// Second pass: now that all the counts are in, accumulate the squared frequencies
			// per document. TryGetValue replaces the original ContainsKey + indexer double lookup.
			foreach (TLTermEntry t in dict.TermEntries)
			{
				foreach (TLPosting p in t.Postings)
				{
					artId = p.ArtifactId;
					double accumulated;
					documentVectorLength.TryGetValue(artId, out accumulated);
					documentVectorLength[artId] = accumulated + Math.Pow(p.Frequency, 2);
				}
			}

			// Finally, we need to get the square root of all entries in the document vector length.
			foreach (TLArtifact artifact in setOfProcessedDocuments.Values)
			{
				if (documentVectorLength.ContainsKey(artifact.Id))
				{
					vectorLength = Math.Sqrt(documentVectorLength[artifact.Id]);
					// Here we update the document vector length of the dictionary - not the internal structure any more.
					dict.SetDocumentVectorWeight(artifact.Id, vectorLength);
				}
			}

			return dict;
		}