/// <summary>
/// Collects the indices of all documents that contain at least one term of the query vector.
/// </summary>
/// <param name="wordsVector">Inverted index mapping each word to the documents it occurs in.</param>
/// <param name="queryVector">Term vector built from the query text.</param>
/// <returns>Document indices accumulated via <c>AddDocuments</c> for every matching term.</returns>
private static List <int> DocumentsOccuredInQuery(WordsVector wordsVector, Vector queryVector)
{
    List <int> documents = new List <int>();
    // Hoist the loop-invariant index lookup out of the loop.
    var index = wordsVector.GetVector();
    foreach (var pair in queryVector.Dictionary)
    {
        // TryGetValue replaces the previous ContainsKey + indexer double lookup.
        if (index.TryGetValue(pair.Key, out var wordDocuments))
        {
            documents = AddDocuments(documents, wordDocuments.GetDocuments());
        }
    }
    return documents;
}
/// <summary>
/// Evaluates a boolean query over the inverted index. The query is converted to
/// postfix form and evaluated with a stack of posting sets; supported operators
/// are ! (NOT, complement w.r.t. <paramref name="universalSet"/>), &amp; (AND) and | (OR).
/// </summary>
/// <param name="query">The boolean query in infix form.</param>
/// <param name="universalSet">All document indices, used as the complement base for NOT.</param>
/// <param name="wordsVector">Inverted index mapping words to their documents.</param>
/// <returns>The sorted indices of documents satisfying the query.</returns>
public static IList <int> Parse(string query, HashSet <int> universalSet, WordsVector wordsVector)
{
    var postfixQuery = InfixToPostfixConvertor.Convert(query);
    var index = wordsVector.GetVector();
    var stack = new Stack <HashSet <int> >();
    foreach (var item in postfixQuery)
    {
        if (item == "!")
        {
            // NOT: complement of the operand relative to the universal set.
            var operand = stack.Pop();
            var remaining = new HashSet <int>(universalSet);
            remaining.ExceptWith(operand);
            stack.Push(remaining);
        }
        else if (item == "&")
        {
            var operand1 = stack.Pop();
            var operand2 = stack.Pop();
            operand1.IntersectWith(operand2);
            stack.Push(operand1);
        }
        else if (item == "|")
        {
            var operand1 = stack.Pop();
            var operand2 = stack.Pop();
            operand1.UnionWith(operand2);
            stack.Push(operand1);
        }
        else
        {
            // Term token: push its posting set; terms absent from the index map to
            // the empty set. TryGetValue replaces the previous try/catch control flow,
            // which also hid any unrelated exception thrown inside the try block.
            stack.Push(index.TryGetValue(item, out var wordDocuments)
                ? wordDocuments.GetDocuments()
                : new HashSet <int>());
        }
    }
    var answer = stack.Pop().ToList();
    answer.Sort();
    return answer;
}
/// <summary>
/// Ranks the documents that contain at least one query term by cosine similarity
/// between each document's vector and the query's tf-idf vector.
/// </summary>
/// <param name="query">The free-text query.</param>
/// <param name="wordsVector">Inverted index mapping words to their documents.</param>
/// <param name="documentsVector">Per-document term vectors.</param>
/// <returns>Document indices ordered by <c>SortTheDictionaryOnValue</c> over the similarities.</returns>
public static IList <int> Parse(string query, WordsVector wordsVector, DocumentsVector documentsVector)
{
    var totalDocuments = documentsVector.DocumentsIndex().Count;
    var queryTfIdf = ComputeQueryTfIdfVector(query, wordsVector, totalDocuments);
    var similarities = new Dictionary <int, double>();
    // Only documents that actually share a term with the query are scored.
    foreach (var documentIndex in DocumentsOccuredInQuery(wordsVector, queryTfIdf))
    {
        var documentVector = documentsVector.GetVectorOfDocumentIndex(documentIndex);
        similarities.Add(documentIndex, ComputeCosineSimilarity(documentVector, queryTfIdf));
    }
    return SortTheDictionaryOnValue(similarities);
}
/// <summary>
/// Builds the tf-idf vector of a query: each term's raw count is multiplied by its
/// inverse document frequency, idf = log10(totalDocuments / documentFrequency).
/// </summary>
/// <param name="query">The query text used to build the initial term-count vector.</param>
/// <param name="wordsVector">Inverted index providing per-word document frequencies.</param>
/// <param name="totalDocuments">Total number of documents in the collection.</param>
/// <returns>The query vector with counts re-weighted by idf.</returns>
private static Vector ComputeQueryTfIdfVector(string query, WordsVector wordsVector, int totalDocuments)
{
    Vector vector = new Vector(query);
    var index = wordsVector.GetVector();
    foreach (var pair in vector.Dictionary)
    {
        // Terms absent from the corpus keep an idf of 0, zeroing their weight.
        double inverseDocumentFrequency = 0;
        if (index.TryGetValue(pair.Key, out var wordDocuments))
        {
            var documentFrequency = wordDocuments.DocumentCount;
            // BUG FIX: idf is log10(totalDocuments / documentFrequency); the previous
            // code computed log10(documentFrequency) / totalDocuments, which inverts
            // the weighting (rare terms scored LOWER than common ones).
            inverseDocumentFrequency = Math.Log10((double)totalDocuments / documentFrequency);
        }
        // Mutates the value object held by the dictionary, not the dictionary itself,
        // so enumeration stays valid.
        pair.Value.Count *= inverseDocumentFrequency;
    }
    return vector;
}
/// <summary>
/// Evaluates a proximity query of the form "word1 word2 / k": returns the documents
/// in which word2 occurs exactly k positions before or after word1.
/// </summary>
/// <param name="query">The proximity query text.</param>
/// <param name="wordsVector">Inverted index mapping words to documents and positions.</param>
/// <returns>Sorted indices of matching documents; empty when the query does not match
/// the expected form or either word is absent from the index.</returns>
public static IList <int> Parse(string query, WordsVector wordsVector)
{
    var proximityQueryRegex = new Regex(@"^(\w+)\s(\w+)\s*/\s*(\d+)$");
    var match = proximityQueryRegex.Match(query);
    // Guard the match explicitly: previously a non-matching query reached
    // Convert.ToInt32("") and threw OUTSIDE the catch-all below.
    if (!match.Success)
    {
        return new List <int>();
    }
    var operand1 = match.Groups[1].Value;
    var operand2 = match.Groups[2].Value;
    var displacement = Convert.ToInt32(match.Groups[3].Value);

    // TryGetValue replaces the previous catch-all, which swallowed every exception
    // (including real bugs) instead of just the expected missing-key case.
    var index = wordsVector.GetVector();
    if (!index.TryGetValue(operand1, out var document1) ||
        !index.TryGetValue(operand2, out var document2))
    {
        return new List <int>();
    }

    // Only documents containing BOTH words can satisfy the proximity constraint.
    var commonDocuments = document1.GetDocuments();
    commonDocuments.IntersectWith(document2.GetDocuments());

    List <int> answer = new List <int>();
    foreach (int doc in commonDocuments)
    {
        var positions1 = document1.DocumentsDictionary[doc].Positions;
        var positions2 = document2.DocumentsDictionary[doc].Positions;
        foreach (var position in positions1)
        {
            // A single position pair at distance k is enough to accept the document.
            if (positions2.Contains(position + displacement) ||
                positions2.Contains(position - displacement))
            {
                answer.Add(doc);
                break;
            }
        }
    }
    answer.Sort();
    return answer;
}
/// <summary>
/// Dispatches a raw query to the appropriate parser after preprocessing:
/// proximity, general (ranked) text, or boolean — tried in that order.
/// </summary>
/// <param name="_query">The raw query text, preprocessed before dispatch.</param>
/// <param name="wordsVector">Inverted index used by all parsers.</param>
/// <param name="documentsVector">Per-document vectors for ranked retrieval.</param>
/// <param name="universalSet">All document indices, used by the boolean parser for NOT.</param>
/// <returns>The document indices produced by the selected parser.</returns>
/// <exception cref="ArgumentException">Thrown when the query matches no supported form.</exception>
public IList <int> Parse(string _query, WordsVector wordsVector, DocumentsVector documentsVector, HashSet <int> universalSet)
{
    var query = Preprocessor.GetInstance().PreprocessQuery(_query);
    if (ProximityQueryRegex.IsMatch(query))
    {
        return ProximityQuery.Parse(query, wordsVector);
    }
    if (GeneralTextQueryRegex.IsMatch(query))
    {
        return GeneralTextQueryParser.Parse(query, wordsVector, documentsVector);
    }
    if (BooleanQueryRegex.IsMatch(query))
    {
        return BooleanQueryParser.Parse(query, universalSet, wordsVector);
    }
    // CA2201: throw a specific exception type rather than System.Exception.
    // ArgumentException derives from Exception, so existing catch(Exception) callers still work.
    throw new ArgumentException("Invalid Query", nameof(_query));
}
// Verifies that a WordsVector built incrementally via Update() equals a WordsVector
// constructed directly from a hand-built word -> WordDocuments dictionary.
// NOTE(review): Assert.AreEqual relies on WordsVector defining value equality — confirm.
public void test() {
    // Three tiny "documents"; a document's index is its position in this array.
    string[] sentences = { "Articles Articles the English.", "Articles in the Arabic language.", "I love Playing cricket." };
    // Build the expected index the production way: one Vector per sentence, merged via Update.
    var expected = new WordsVector();
    for (int i = 0; i < 3; i++) { var vector = new Vector(sentences[i]); expected.Update(i, vector); }
    // Hand-built inverted index: preprocessed word -> its WordDocuments (doc -> positions).
    // Words like "the", "in", "I" are omitted — presumably removed by preprocessing; verify.
    Dictionary <string, WordDocuments> WV_dictionary = new Dictionary <string, WordDocuments>();
    // "Articles": occurs at positions 0 and 1 in document 0 ...
    var index = new Index();
    index.Word = Preprocessor.GetInstance().Preprocess("Articles");
    index.AddOccurrence(0);
    index.AddOccurrence(1);
    var wordDocument = new WordDocuments(0, index);
    // ... and at position 0 in document 1.
    index = new Index();
    index.Word = Preprocessor.GetInstance().Preprocess("Articles");
    index.AddOccurrence(0);
    wordDocument.Update(1, index);
    WV_dictionary.Add(Preprocessor.GetInstance().Preprocess("Articles"), wordDocument);
    // "English": position 3 in document 0.
    index = new Index();
    index.Word = Preprocessor.GetInstance().Preprocess("English");
    index.AddOccurrence(3);
    WV_dictionary.Add(Preprocessor.GetInstance().Preprocess("English"), new WordDocuments(0, index));
    // "Arabic": position 3 in document 1.
    index = new Index();
    index.Word = Preprocessor.GetInstance().Preprocess("Arabic");
    index.AddOccurrence(3);
    WV_dictionary.Add(Preprocessor.GetInstance().Preprocess("Arabic"), new WordDocuments(1, index));
    // "language": position 4 in document 1.
    index = new Index();
    index.Word = Preprocessor.GetInstance().Preprocess("language");
    index.AddOccurrence(4);
    WV_dictionary.Add(Preprocessor.GetInstance().Preprocess("language"), new WordDocuments(1, index));
    // "love": position 1 in document 2.
    index = new Index();
    index.Word = Preprocessor.GetInstance().Preprocess("love");
    index.AddOccurrence(1);
    WV_dictionary.Add(Preprocessor.GetInstance().Preprocess("love"), new WordDocuments(2, index));
    // "Playing": position 2 in document 2.
    index = new Index();
    index.Word = Preprocessor.GetInstance().Preprocess("Playing");
    index.AddOccurrence(2);
    WV_dictionary.Add(Preprocessor.GetInstance().Preprocess("Playing"), new WordDocuments(2, index));
    // "cricket": position 3 in document 2.
    index = new Index();
    index.Word = Preprocessor.GetInstance().Preprocess("cricket");
    index.AddOccurrence(3);
    WV_dictionary.Add(Preprocessor.GetInstance().Preprocess("cricket"), new WordDocuments(2, index));
    // The directly-constructed index must match the incrementally-built one.
    var actual = new WordsVector(WV_dictionary);
    Assert.AreEqual(expected, actual);
}
/// <summary>
/// Initializes every backing store to an empty instance.
/// NOTE(review): the private constructor suggests a singleton — confirm against the
/// rest of the class, which is not visible here.
/// </summary>
private DataStorage()
{
    // Independent initializations; order is not significant.
    documentsVector = new DocumentsVector();
    wordsVector = new WordsVector();
    FYP_Data = new Dictionary <int, FYPSearchModel>();
}