Ejemplo n.º 1
0
        /// <summary>
        /// Loads the "DocumentVectors" collection from the workspace, runs it through
        /// <c>Models.Normalizer</c>, and stores the normalizer's vectors and lengths
        /// back under "NormalizedVectors" and "NormalizedVectorLengths".
        /// </summary>
        public override void Compute()
        {
            // The workspace hands back an untyped object; the cast throws if the
            // stored value is not a DocumentVectorCollection.
            var rawVectors = (DocumentVectorCollection)Workspace.Load("DocumentVectors");
            var normalizer = new Models.Normalizer(rawVectors);

            // Publish both results produced by the normalizer.
            Workspace.Store("NormalizedVectors", normalizer.Vectors);
            Workspace.Store("NormalizedVectorLengths", normalizer.Lengths);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Loads "Queries", "DocumentLengths" and "Documents" from the workspace,
        /// computes their similarity matrix with <c>Models.CosineSimilarity.Compute</c>,
        /// and stores the result under "SimilarityMatrix".
        /// </summary>
        public override void Compute()
        {
            // Inputs are loaded in the same order as before: queries, lengths, documents.
            var queryVectors = (DocumentVectorCollection)Workspace.Load("Queries");
            var docLengths = (NormalizedVector)Workspace.Load("DocumentLengths");
            var docVectors = (NormalizedVectorCollection)Workspace.Load("Documents");

            Workspace.Store(
                "SimilarityMatrix",
                Models.CosineSimilarity.Compute(docVectors, docLengths, queryVectors));
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Creates the normalizer and immediately normalizes every vector in
        /// <paramref name="documents"/>, collecting the results in <c>vectors</c>.
        /// </summary>
        /// <param name="documents">Document vectors to normalize, keyed by document id.</param>
        public Normalizer(DocumentVectorCollection documents)
        {
            vectors = new NormalizedVectorCollection();
            lengths = new NormalizedVector("DocumentVectorLengths");

            // NOTE(review): Normalize is not visible here; presumably it also records
            // each document's length in `lengths` -- confirm against its definition.
            foreach (KeyValuePair<string, DocumentVector> entry in documents)
            {
                vectors.Add(Normalize(entry.Key, entry.Value));
            }
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Builds one term vector per artifact (stored in <c>vectors</c>) and accumulates
        /// in <c>freq</c> the number of documents in which each term occurs.
        /// </summary>
        /// <param name="artifacts">Artifacts to vectorize; each artifact's text is split on single spaces.</param>
        /// <param name="representation">
        /// When equal to "Ordinal", every repeated occurrence of a term increments its
        /// count in the document vector; for any other value a term's count stays at 1.
        /// </param>
        public Vectorizer(TLArtifactsCollection artifacts, String representation)
        {
            vectors = new DocumentVectorCollection();
            freq = new DocumentVector("DocumentFrequencies");

            // The representation does not change while iterating, so test it once.
            bool countRepeats = representation == "Ordinal";

            foreach (KeyValuePair<string, TLArtifact> kvp in artifacts)
            {
                // vars
                String docID = kvp.Value.Id;
                // NOTE: Split(' ') keeps empty entries, so consecutive spaces yield "" terms.
                String[] words = kvp.Value.Text.Split(' ');

                // create new document representation
                DocumentVector vec = new DocumentVector(docID);

                // Terms of this document that were already counted in freq. A hash set
                // gives constant-time membership tests instead of a linear list scan
                // for every word.
                HashSet<String> countedWords = new HashSet<String>();

                // loop over each word and update its frequency
                foreach (String word in words)
                {
                    // update term-doc frequency only ONCE per document:
                    // HashSet.Add returns true only the first time the word is seen here
                    if (countedWords.Add(word))
                    {
                        if (freq.ContainsKey(word))
                        {
                            freq[word]++;
                        }
                        else
                        {
                            freq.Add(word, 1);
                        }
                    }

                    // update word frequency
                    if (!vec.ContainsKey(word))
                    {
                        vec.Add(word, 1);
                    }
                    else if (countRepeats)
                    {
                        vec[word]++;
                    }

                    // update MaxFreq
                    if (vec[word] > vec.MaxFreq.Value)
                    {
                        vec.MaxFreq = new KeyValuePair<string, int>(word, vec[word]);
                    }
                }

                // add document to vector collection
                vectors.Add(vec);
            }
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Builds one term vector per artifact (stored in <c>vectors</c>) and accumulates
        /// in <c>freq</c> the number of documents in which each term occurs.
        /// </summary>
        /// <param name="artifacts">Artifacts to vectorize; each artifact's text is split on single spaces.</param>
        /// <param name="representation">
        /// When equal to "Ordinal", every repeated occurrence of a term increments its
        /// count in the document vector; for any other value a term's count stays at 1.
        /// </param>
        public Vectorizer(TLArtifactsCollection artifacts, String representation)
        {
            vectors = new DocumentVectorCollection();
            freq    = new DocumentVector("DocumentFrequencies");

            // The representation does not change while iterating, so test it once.
            bool countRepeats = representation == "Ordinal";

            foreach (KeyValuePair <string, TLArtifact> kvp in artifacts)
            {
                // vars
                String   docID = kvp.Value.Id;
                // NOTE: Split(' ') keeps empty entries, so consecutive spaces yield "" terms.
                String[] words = kvp.Value.Text.Split(' ');

                // create new document representation
                DocumentVector vec = new DocumentVector(docID);

                // Terms of this document that were already counted in freq. A hash set
                // gives constant-time membership tests instead of a linear list scan
                // for every word.
                HashSet <String> countedWords = new HashSet <String>();

                // loop over each word and update its frequency
                foreach (String word in words)
                {
                    // update term-doc frequency only ONCE per document:
                    // HashSet.Add returns true only the first time the word is seen here
                    if (countedWords.Add(word))
                    {
                        if (freq.ContainsKey(word))
                        {
                            freq[word]++;
                        }
                        else
                        {
                            freq.Add(word, 1);
                        }
                    }

                    // update word frequency
                    if (!vec.ContainsKey(word))
                    {
                        vec.Add(word, 1);
                    }
                    else if (countRepeats)
                    {
                        vec[word]++;
                    }

                    // update MaxFreq
                    if (vec[word] > vec.MaxFreq.Value)
                    {
                        vec.MaxFreq = new KeyValuePair <string, int>(word, vec[word]);
                    }
                }

                // add document to vector collection
                vectors.Add(vec);
            }
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Creates the normalizer and immediately normalizes every vector in
        /// <paramref name="documents"/>, collecting the results in <c>vectors</c>.
        /// </summary>
        /// <param name="documents">Document vectors to normalize, keyed by document id.</param>
        public Normalizer(DocumentVectorCollection documents)
        {
            vectors = new NormalizedVectorCollection();
            lengths = new NormalizedVector("DocumentVectorLengths");

            // NOTE(review): Normalize is not visible here; presumably it also records
            // each document's length in `lengths` -- confirm against its definition.
            foreach (KeyValuePair<string, DocumentVector> entry in documents)
            {
                NormalizedVector normalized = Normalize(entry.Key, entry.Value);
                vectors.Add(normalized);
            }
        }
        /// <summary>
        /// Computes a similarity score for every (query, document) pair and records it
        /// as a link in a new <c>TLSimilarityMatrix</c>.
        /// </summary>
        /// <param name="docs">Normalized document vectors, keyed by document id.</param>
        /// <param name="lengths">Vector length of each document, looked up by document id.</param>
        /// <param name="queries">Query vectors, keyed by query id.</param>
        /// <returns>A matrix holding one link per query/document pair.</returns>
        public static TLSimilarityMatrix Compute(NormalizedVectorCollection docs, NormalizedVector lengths, DocumentVectorCollection queries)
        {
            TLSimilarityMatrix sims = new TLSimilarityMatrix();

            foreach (KeyValuePair<string, DocumentVector> QueryKVP in queries)
            {
                /*
                 * Since tf in queries are all 1,
                 * we can assume this term is the sqrt of the size of the dictionary
                 */
                double qVal = Math.Sqrt(QueryKVP.Value.Count);
                foreach (KeyValuePair<string, NormalizedVector> DocKVP in docs)
                {
                    double dVal = lengths[DocKVP.Key];
                    double qdVec = ComputeProduct(QueryKVP.Value, DocKVP.Value);
                    double denominator = qVal * dVal;

                    // An empty query or a zero-length document makes the denominator
                    // exactly 0; dividing would store NaN/Infinity in the matrix, so
                    // such pairs are given a similarity of 0 instead.
                    double score = (denominator == 0.0) ? 0.0 : qdVec / denominator;
                    sims.AddLink(QueryKVP.Key, DocKVP.Key, score);
                }
            }

            return sims;
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Computes a similarity score for every (query, document) pair and records it
        /// as a link in a new <c>TLSimilarityMatrix</c>.
        /// </summary>
        /// <param name="docs">Normalized document vectors, keyed by document id.</param>
        /// <param name="lengths">Vector length of each document, looked up by document id.</param>
        /// <param name="queries">Query vectors, keyed by query id.</param>
        /// <returns>A matrix holding one link per query/document pair.</returns>
        public static TLSimilarityMatrix Compute(NormalizedVectorCollection docs, NormalizedVector lengths, DocumentVectorCollection queries)
        {
            TLSimilarityMatrix sims = new TLSimilarityMatrix();

            foreach (KeyValuePair <string, DocumentVector> QueryKVP in queries)
            {
                /*
                 * Since tf in queries are all 1,
                 * we can assume this term is the sqrt of the size of the dictionary
                 */
                double qVal = Math.Sqrt(QueryKVP.Value.Count);
                foreach (KeyValuePair <string, NormalizedVector> DocKVP in docs)
                {
                    double dVal        = lengths[DocKVP.Key];
                    double qdVec       = ComputeProduct(QueryKVP.Value, DocKVP.Value);
                    double denominator = qVal * dVal;

                    // An empty query or a zero-length document makes the denominator
                    // exactly 0; dividing would store NaN/Infinity in the matrix, so
                    // such pairs are given a similarity of 0 instead.
                    double score = (denominator == 0.0) ? 0.0 : qdVec / denominator;
                    sims.AddLink(QueryKVP.Key, DocKVP.Key, score);
                }
            }

            return(sims);
        }