/// <summary>
/// Loads the raw document vectors from the workspace, normalizes them,
/// and stores both the normalized vectors and their lengths back into the workspace.
/// </summary>
public override void Compute()
{
    DocumentVectorCollection documents = (DocumentVectorCollection)Workspace.Load("DocumentVectors");
    Models.Normalizer normalizer = new Models.Normalizer(documents);
    Workspace.Store("NormalizedVectors", normalizer.Vectors);
    Workspace.Store("NormalizedVectorLengths", normalizer.Lengths);
}
/// <summary>
/// Loads queries, document lengths, and normalized document vectors from the
/// workspace, computes the query/document cosine-similarity matrix, and stores it.
/// </summary>
public override void Compute()
{
    DocumentVectorCollection queryVectors = (DocumentVectorCollection)Workspace.Load("Queries");
    NormalizedVector documentLengths = (NormalizedVector)Workspace.Load("DocumentLengths");
    NormalizedVectorCollection documentVectors = (NormalizedVectorCollection)Workspace.Load("Documents");
    TLSimilarityMatrix similarities = Models.CosineSimilarity.Compute(documentVectors, documentLengths, queryVectors);
    Workspace.Store("SimilarityMatrix", similarities);
}
/// <summary>
/// Normalizes every document vector in the collection, recording each
/// normalized vector and (via Normalize) its length.
/// </summary>
/// <param name="documents">Document vectors keyed by document id.</param>
public Normalizer(DocumentVectorCollection documents)
{
    lengths = new NormalizedVector("DocumentVectorLengths");
    vectors = new NormalizedVectorCollection();
    foreach (KeyValuePair<string, DocumentVector> entry in documents)
    {
        NormalizedVector normalized = Normalize(entry.Key, entry.Value);
        vectors.Add(normalized);
    }
}
/// <summary>
/// Builds a term-frequency vector for each artifact and a document-frequency
/// vector that counts, per term, how many documents contain it.
/// </summary>
/// <param name="artifacts">Artifacts to vectorize; each artifact's text is split on single spaces.</param>
/// <param name="representation">"Ordinal" counts every occurrence of a term;
/// any other value yields Boolean (0/1) term frequencies.</param>
public Vectorizer(TLArtifactsCollection artifacts, String representation)
{
    vectors = new DocumentVectorCollection();
    freq = new DocumentVector("DocumentFrequencies");
    // Hoist the loop-invariant representation check out of the inner loop.
    bool ordinal = representation == "Ordinal";
    foreach (KeyValuePair<string, TLArtifact> kvp in artifacts)
    {
        String docID = kvp.Value.Id;
        String[] words = kvp.Value.Text.Split(' ');
        // Term-frequency vector for this document.
        DocumentVector vec = new DocumentVector(docID);
        // HashSet gives O(1) membership tests; the original List<String>.Contains
        // made this loop O(n^2) per document.
        HashSet<String> seenInDoc = new HashSet<String>();
        foreach (String word in words)
        {
            // Update the document frequency only ONCE per document:
            // Add returns false when the word was already seen in this document.
            if (seenInDoc.Add(word))
            {
                if (freq.ContainsKey(word))
                {
                    freq[word]++;
                }
                else
                {
                    freq.Add(word, 1);
                }
            }
            // Update the term frequency within this document.
            if (!vec.ContainsKey(word))
            {
                vec.Add(word, 1);
            }
            else if (ordinal)
            {
                vec[word]++;
            }
            // Track the most frequent term in this document.
            if (vec[word] > vec.MaxFreq.Value)
            {
                vec.MaxFreq = new KeyValuePair<string, int>(word, vec[word]);
            }
        }
        // Add the completed document vector to the collection.
        vectors.Add(vec);
    }
}
/// <summary>
/// Builds a term-frequency vector for each artifact and a document-frequency
/// vector that counts, per term, how many documents contain it.
/// </summary>
/// <param name="artifacts">Artifacts to vectorize; each artifact's text is split on single spaces.</param>
/// <param name="representation">"Ordinal" counts every occurrence of a term;
/// any other value yields Boolean (0/1) term frequencies.</param>
public Vectorizer(TLArtifactsCollection artifacts, String representation)
{
    vectors = new DocumentVectorCollection();
    freq = new DocumentVector("DocumentFrequencies");
    // Hoist the loop-invariant representation check out of the inner loop.
    bool ordinal = representation == "Ordinal";
    foreach (KeyValuePair<string, TLArtifact> kvp in artifacts)
    {
        String docID = kvp.Value.Id;
        String[] words = kvp.Value.Text.Split(' ');
        // Term-frequency vector for this document.
        DocumentVector vec = new DocumentVector(docID);
        // HashSet gives O(1) membership tests; the original List<String>.Contains
        // made this loop O(n^2) per document.
        HashSet<String> seenInDoc = new HashSet<String>();
        foreach (String word in words)
        {
            // Update the document frequency only ONCE per document:
            // Add returns false when the word was already seen in this document.
            if (seenInDoc.Add(word))
            {
                if (freq.ContainsKey(word))
                {
                    freq[word]++;
                }
                else
                {
                    freq.Add(word, 1);
                }
            }
            // Update the term frequency within this document.
            if (!vec.ContainsKey(word))
            {
                vec.Add(word, 1);
            }
            else if (ordinal)
            {
                vec[word]++;
            }
            // Track the most frequent term in this document.
            if (vec[word] > vec.MaxFreq.Value)
            {
                vec.MaxFreq = new KeyValuePair<string, int>(word, vec[word]);
            }
        }
        // Add the completed document vector to the collection.
        vectors.Add(vec);
    }
}
/// <summary>
/// Normalizes every document vector in the collection, recording each
/// normalized vector and (via Normalize) its length.
/// </summary>
/// <param name="documents">Document vectors keyed by document id.</param>
public Normalizer(DocumentVectorCollection documents)
{
    lengths = new NormalizedVector("DocumentVectorLengths");
    vectors = new NormalizedVectorCollection();
    foreach (KeyValuePair<string, DocumentVector> entry in documents)
    {
        NormalizedVector normalized = Normalize(entry.Key, entry.Value);
        vectors.Add(normalized);
    }
}
/// <summary>
/// Computes the cosine similarity between every query and every document.
/// </summary>
/// <param name="docs">Normalized document vectors keyed by document id.</param>
/// <param name="lengths">Precomputed vector length for each document id.</param>
/// <param name="queries">Query vectors keyed by query id.</param>
/// <returns>A similarity matrix with one link per (query, document) pair.</returns>
public static TLSimilarityMatrix Compute(NormalizedVectorCollection docs, NormalizedVector lengths, DocumentVectorCollection queries)
{
    TLSimilarityMatrix sims = new TLSimilarityMatrix();
    foreach (KeyValuePair<string, DocumentVector> query in queries)
    {
        // Query term frequencies are all 1, so the query vector's length
        // reduces to the square root of its term count.
        double queryLength = Math.Sqrt(query.Value.Count);
        foreach (KeyValuePair<string, NormalizedVector> doc in docs)
        {
            double docLength = lengths[doc.Key];
            double dotProduct = ComputeProduct(query.Value, doc.Value);
            sims.AddLink(query.Key, doc.Key, dotProduct / (queryLength * docLength));
        }
    }
    return sims;
}
/// <summary>
/// Computes the cosine similarity between every query and every document.
/// </summary>
/// <param name="docs">Normalized document vectors keyed by document id.</param>
/// <param name="lengths">Precomputed vector length for each document id.</param>
/// <param name="queries">Query vectors keyed by query id.</param>
/// <returns>A similarity matrix with one link per (query, document) pair.</returns>
public static TLSimilarityMatrix Compute(NormalizedVectorCollection docs, NormalizedVector lengths, DocumentVectorCollection queries)
{
    TLSimilarityMatrix sims = new TLSimilarityMatrix();
    foreach (KeyValuePair<string, DocumentVector> query in queries)
    {
        // Query term frequencies are all 1, so the query vector's length
        // reduces to the square root of its term count.
        double queryLength = Math.Sqrt(query.Value.Count);
        foreach (KeyValuePair<string, NormalizedVector> doc in docs)
        {
            double docLength = lengths[doc.Key];
            double dotProduct = ComputeProduct(query.Value, doc.Value);
            sims.AddLink(query.Key, doc.Key, dotProduct / (queryLength * docLength));
        }
    }
    return sims;
}