/// <summary>
/// Scores every entry of <c>results</c> against the query built from <c>queryString</c>
/// and returns the scores ordered from most to least relevant.
/// </summary>
/// <remarks>
/// Each result is wrapped in a <c>Document</c>; the union of all document tokens forms the
/// term set over which both the document vectors and the query vector are built. The score
/// of a document is whatever <c>Vector.GetSimilarityScore</c> returns for the query vector
/// and that document's vector.
/// </remarks>
/// <returns>
/// One pair per document, keyed by the document's <c>ToString()</c> value, sorted by
/// descending similarity score. Documents with equal scores keep their original relative order.
/// </returns>
public List<KeyValuePair<string, double>> RankedResults()
{
    List<Document> documents = new List<Document>();
    HashSet<string> dataSet = new HashSet<string>();

    // Collect the documents and the set of all terms that occur in any of them.
    foreach (var result in results)
    {
        Document d = new Document(result);
        documents.Add(d);
        foreach (var term in d.tokens())
        {
            dataSet.Add(term);
        }
    }

    // Build document vectors over the shared term set.
    // Note: Add throws if two documents produce the same ToString() key.
    Dictionary<string, Vector> documentVectors = new Dictionary<string, Vector>();
    foreach (var document in documents)
    {
        documentVectors.Add(document.ToString(), new Vector(dataSet, document));
    }

    // Build the query vector over the same term set.
    Query query = new Query(queryString);
    Vector queryVector = new Vector(dataSet, query);

    Dictionary<string, double> relevance = new Dictionary<string, double>();
    foreach (var documentVector in documentVectors)
    {
        relevance.Add(documentVector.Key, Vector.GetSimilarityScore(queryVector, documentVector.Value));
    }

    // Sort by most relevant first. Previously the list was returned unsorted,
    // in dictionary enumeration order, despite the method's name.
    return relevance.OrderByDescending(entry => entry.Value).ToList();
}
/// <summary>
/// Adds a document to the inverted index and then calls <c>serialize()</c>.
/// </summary>
/// <remarks>
/// For every token position in <c>document.tokens()</c> this updates three structures,
/// all keyed first by term and then by the document's <c>ToString()</c> value:
/// <c>index</c> (list of token positions), <c>tfIndex</c> (occurrence count of the term in
/// the document) and <c>dfIndex</c> (number of documents in which the term has been seen).
/// <c>numOfDocuments</c> is incremented once per call.
/// </remarks>
/// <param name="document">The document whose tokens are indexed.</param>
public static void add(Document document)
{
    numOfDocuments += 1;
    List<string> terms = document.tokens();

    // The document key is the same for every token, so compute it once.
    string docKey = document.ToString();

    for (int position = 0; position < terms.Count; position++)
    {
        string term = terms[position];

        Dictionary<string, List<int>> postings;
        if (!index.TryGetValue(term, out postings))
        {
            // New term: create its postings, term-frequency and document-frequency entries.
            postings = new Dictionary<string, List<int>>();
            postings[docKey] = new List<int> { position };
            index[term] = postings;

            Dictionary<string, int> tf = new Dictionary<string, int>();
            tf[docKey] = 1;
            tfIndex[term] = tf;
            dfIndex[term] = 1;
            continue;
        }

        List<int> positions;
        if (postings.TryGetValue(docKey, out positions))
        {
            // Term already recorded for this document: bump its frequency and append the position.
            tfIndex[term][docKey] += 1;
            positions.Add(position);
        }
        else
        {
            // Known term, first occurrence in this document.
            dfIndex[term] += 1;
            tfIndex[term][docKey] = 1;
            postings[docKey] = new List<int> { position };
        }
    }

    serialize();
}
/// <summary>
/// Program entry point: constructs <c>Document</c> instances for a set of hard-coded
/// paths and adds the first three of them to <c>InvertedIndex</c>.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    // All seven documents are constructed up front, in this order,
    // although only the first three are indexed below.
    var doc1 = new Document("C:\\Users\\LOLU\\Documents\\csc322\\doc1", "txt");
    var doc2 = new Document("C:\\Users\\LOLU\\Documents\\csc322\\doc2", "txt");
    var ossPdf = new Document("C:\\Users\\LOLU\\Books~Tutorials\\OSS2014.pdf");
    var timeComplexityPdf = new Document("C:\\Users\\LOLU\\Books~Tutorials\\codility lessons\\1-TimeComplexity.pdf");
    var countingElementsPdf = new Document("C:\\Users\\LOLU\\Books~Tutorials\\codility lessons\\2-CountingElements.pdf");
    var prefixSumsPdf = new Document("C:\\Users\\LOLU\\Books~Tutorials\\codility lessons\\3-PrefixSums.pdf");
    var testHtml = new Document("C:\\Users\\LOLU\\Documents\\csc322\\test", "html");

    foreach (var document in new[] { doc1, doc2, ossPdf })
    {
        InvertedIndex.add(document);
    }

    /* Disabled query demo:
    Query query = new Query("open source information");
    Console.WriteLine(query.QueryType());
    Console.WriteLine(query.tokens().Count);
    foreach(var item in query.RankedResults()){
        Console.WriteLine(item);
    }*/
}
/// <summary>
/// Creates a vector for <paramref name="document"/> over the given term set.
/// </summary>
/// <param name="dataSet">The set of terms; stored on the instance and passed to <c>document.GetVector</c>.</param>
/// <param name="document">The document whose <c>GetVector</c> result becomes this vector's representation.</param>
public Vector(HashSet<string> dataSet, Document document)
{
    // Keep a reference to the term set the representation was built against.
    this.dataSet = dataSet;

    // The document itself produces its representation for that term set.
    vectorRep = document.GetVector(dataSet);
}