/// <summary>
/// Scores every document in <paramref name="wrapper"/> against the query tokens and
/// returns each document paired with its length-normalised score, sorted ascending.
/// </summary>
/// <param name="tokens">
/// Query terms. Each one is looked up with the <c>incidenceVector</c> indexer, so a term
/// that is not a key of <c>incidenceVector</c> makes the lookup throw.
/// </param>
/// <param name="wrapper">Holder whose <c>DocumentList</c> contains the candidate documents.</param>
/// <returns>
/// One pair per candidate document (document, score), ordered by ascending score.
/// Documents whose length evaluates to NaN are left out.
/// </returns>
private List<KeyValuePair<Document, double>> CalculateCosScores(List<string> tokens, DfWrapper wrapper)
{
    List<KeyValuePair<Document, double>> scored =
        new List<KeyValuePair<Document, double>>(wrapper.DocumentList.Count);

    foreach (Document doc in wrapper.DocumentList)
    {
        // Numerator: the log-scaled term frequency of the document, added once per query token.
        double weightSum = 0;
        if (doc.TermFrequency != 0)
        {
            // The weight does not depend on the token, so compute the logarithm only once.
            double logWeight = 1 + Math.Log(doc.TermFrequency);
            for (int i = 0; i < tokens.Count; i++)
            {
                weightSum += logWeight;
            }
        }

        // Denominator: squared tf-idf accumulated for every token whose posting list holds the document.
        double squaredLength = 0;
        foreach (string word in tokens)
        {
            if (incidenceVector[word].DocumentList.Contains(doc))
            {
                squaredLength += doc.TfIdf * doc.TfIdf;
            }
        }

        double length = Math.Sqrt(squaredLength);
        if (double.IsNaN(length))
        {
            // Unusable tf-idf value: such documents never passed the length check and still do not.
            continue;
        }

        // A zero length used to produce Infinity/NaN through division by zero and corrupt
        // the ranking; such a document now receives a score of 0 instead.
        double score = length > 0 ? weightSum / length : 0;
        scored.Add(new KeyValuePair<Document, double>(doc, score));
    }

    // OrderBy is stable, so documents with equal scores keep their DocumentList order.
    return scored.OrderBy(pair => pair.Value).ToList();
}
/// <summary>
/// Evaluates a free-text query and returns the best matching documents.
/// The query is lower-cased, split on single spaces and stripped of stop words.
/// A <c>*not*</c> token excludes every document indexed under the word that follows it.
/// Remaining documents are scored with <c>CalculateCosScores</c>, multiplied by their
/// PageRank and returned in descending order of the combined score.
/// </summary>
/// <param name="inputQuery">The raw query text; must not be null.</param>
/// <returns>At most ten (document, combined score) pairs, best first.</returns>
public List<KeyValuePair<Document, double>> PassQuery(string inputQuery)
{
    const string NotOperator = "*not*";
    const int MaxResults = 10;

    List<string> queryWords = new List<string>(inputQuery.ToLower().Split(' '));
    queryWords.RemoveAll(word => stopWords.Exists(sw => sw.Equals(word)));

    List<string> foundTokens = new List<string>();
    List<Document> blacklistedDocuments = new List<Document>();

    for (int position = 0; position < queryWords.Count; position++)
    {
        string queryWord = queryWords[position];

        if (queryWord == NotOperator)
        {
            // The operand is the next word. A trailing "*not*" or an operand that is not
            // indexed used to throw (index out of range / missing key); both are now ignored.
            int operandPosition = position + 1;
            if (operandPosition < queryWords.Count
                && incidenceVector.ContainsKey(queryWords[operandPosition]))
            {
                blacklistedDocuments.AddRange(incidenceVector[queryWords[operandPosition]].DocumentList);
            }
        }

        // NOTE(review): the operand of "*not*" is still visited on the next iteration and is
        // added here as a regular token; its documents are removed by the blacklist below.
        if (incidenceVector.ContainsKey(queryWord) && !foundTokens.Contains(queryWord))
        {
            foundTokens.Add(queryWord);
        }
    }

    // Candidate set: OR-match over the found tokens, de-duplicated, minus blacklisted documents.
    DfWrapper foundPages = new DfWrapper();
    foundPages.DocumentList.AddRange(ORpageFinder(foundTokens));
    foundPages.DocumentList = foundPages.DocumentList.Distinct().ToList();
    foundPages.DocumentList.RemoveAll(doc => blacklistedDocuments.Contains(doc));

    // The tf-idf computation is given the number of candidate documents.
    CalculateTf_Idf(foundPages.DocumentList.Count);

    List<KeyValuePair<Document, double>> cosscore = CalculateCosScores(foundTokens, foundPages);

    // Arguments are passed through unchanged; their meaning is defined by PageRanker.
    double[] pageranks = _crawler.PageRanker(0.10, 200);

    // Index the cosine scores by document id (first occurrence wins, matching a linear
    // first-match search) so each PageRank slot is resolved without rescanning the list.
    Dictionary<int, KeyValuePair<Document, double>> cosineById =
        new Dictionary<int, KeyValuePair<Document, double>>(cosscore.Count);
    foreach (KeyValuePair<Document, double> pair in cosscore)
    {
        if (!cosineById.ContainsKey(pair.Key.Id))
        {
            cosineById.Add(pair.Key.Id, pair);
        }
    }

    // pageranks is addressed by document id; ids without a PageRank slot are dropped.
    List<KeyValuePair<Document, double>> final = new List<KeyValuePair<Document, double>>();
    for (int id = 0; id < pageranks.Length; id++)
    {
        KeyValuePair<Document, double> match;
        if (cosineById.TryGetValue(id, out match))
        {
            final.Add(new KeyValuePair<Document, double>(match.Key, pageranks[id] * match.Value));
        }
    }

    // Ascending sort followed by a reversal yields best-first order.
    final.Sort((x, y) => x.Value.CompareTo(y.Value));
    final.Reverse();

    // Keep only the top results; shorter lists are returned whole.
    if (final.Count > MaxResults)
    {
        final = final.GetRange(0, MaxResults);
    }

    return final;
}