/// <summary>
/// Returns the pages that contain every query word found in the tf-idf table,
/// sorted by descending weight.
/// </summary>
/// <param name="words">
/// Whitespace-separated query. Stop words are removed through <c>i.RemoveStopWords</c>.
/// A null, empty or whitespace-only query yields an empty result.
/// </param>
/// <param name="index">
/// Index handed to <c>tfCalc</c> and <c>idfCalc</c>. The outer key is compared against
/// the query words; the inner key identifies a page.
/// </param>
/// <returns>
/// One (page, weight) pair per page that appears under every matched query word.
/// Query words that have no entry in the tf-idf table are ignored rather than
/// treated as "no page matches". The list is empty when nothing matches.
/// </returns>
public List<KeyValuePair<int, double>> GetPagesWithWords(string words, Dictionary<string, Dictionary<int, double>> index)
{
    if (string.IsNullOrWhiteSpace(words))
    {
        return new List<KeyValuePair<int, double>>();
    }

    List<string> terms = i.RemoveStopWords(words.Split(new char[0], StringSplitOptions.RemoveEmptyEntries).ToList());
    if (terms.Count == 0)
    {
        // Nothing left to search for, so skip the tf-idf computation entirely.
        return new List<KeyValuePair<int, double>>();
    }

    // Hash lookup instead of scanning the term list for every index entry.
    HashSet<string> queryTerms = new HashSet<string>(terms);

    Dictionary<string, Dictionary<int, double>> tf = tfCalc(index);
    Dictionary<string, double> idf = idfCalc(index);
    Dictionary<string, Dictionary<int, double>> tfidf = tfidfCalc(tf, idf);

    // Per-page weights of each query term that exists in the table, in table order.
    List<Dictionary<int, double>> postings = tfidf
        .Where(entry => queryTerms.Contains(entry.Key))
        .Select(entry => entry.Value)
        .ToList();

    if (postings.Count == 0)
    {
        return new List<KeyValuePair<int, double>>();
    }

    // TODO: implement vector comparison.
    // NOTE(review): the weight reported for a page is the one stored under the first
    // matched term only; the other terms just filter the candidates. Confirm whether
    // the weights should be combined across terms.
    List<KeyValuePair<int, double>> pages = postings[0]
        .Where(page => postings.All(termPages => termPages.ContainsKey(page.Key)))
        .ToList();

    // Highest weight first.
    pages.Sort((left, right) => right.Value.CompareTo(left.Value));
    return pages;
}
/// <summary>
/// Loads the page at <paramref name="url"/> and starts two background tasks: one
/// collects the page's hyperlinks and passes them to <c>SortHyperLinks</c>, the other
/// normalises the body text and passes it to <c>ContentHandler.AddContent</c>.
/// </summary>
/// <param name="url">Address of the page to load; also used as the base for root-relative links.</param>
/// <remarks>
/// The tasks are started and not awaited, so this method can return before either of
/// them has finished. A page that fails to load is skipped silently.
/// </remarks>
private void FetchData(string url)
{
    HtmlWeb web = new HtmlWeb();
    HtmlDocument doc;
    try
    {
        doc = web.Load(url);
    }
    catch (Exception)
    {
        // Best effort: a page that cannot be loaded is simply not processed.
        return;
    }

    // Hyperlink extraction.
    Task.Run(() =>
    {
        Uri pageUri = new Uri(url);
        List<string> hyperlinks = CollectHyperlinks(doc, pageUri);
        SortHyperLinks(pageUri.Host, hyperlinks);
    });

    // Content preprocessing.
    Task.Run(() =>
    {
        string content = Indexer.RemoveStopWords(ExtractBodyText(doc).ToLower());
        ContentHandler.AddContent(content, url);
    });
}

/// <summary>Collects the absolute http(s) targets of every anchor in the document.</summary>
private static List<string> CollectHyperlinks(HtmlDocument doc, Uri pageUri)
{
    List<string> hyperlinks = new List<string>();

    HtmlNodeCollection anchors = doc.DocumentNode.SelectNodes("//a[@href]");
    if (anchors == null)
    {
        return hyperlinks;
    }

    foreach (HtmlNode anchor in anchors)
    {
        // Read the href attribute itself; the first quoted value in the tag's markup
        // may belong to another attribute such as class or id.
        string href = anchor.GetAttributeValue("href", string.Empty);

        if (href.StartsWith("/", StringComparison.Ordinal))
        {
            // Resolve against the page address so that "/about" on
            // "http://host/dir/page" becomes "http://host/about".
            Uri resolved;
            if (!Uri.TryCreate(pageUri, href, out resolved))
            {
                continue;
            }

            href = resolved.AbsoluteUri;
        }

        // Relative links without a leading slash and non-http schemes are dropped.
        if (href.StartsWith("http", StringComparison.Ordinal))
        {
            hyperlinks.Add(href);
        }
    }

    return hyperlinks;
}

/// <summary>
/// Concatenates the text of the body node(s), collapses whitespace runs and removes
/// every character outside the allowed set.
/// </summary>
private static string ExtractBodyText(HtmlDocument doc)
{
    string content = string.Empty;

    HtmlNodeCollection bodyNodes = doc.DocumentNode.SelectNodes("//body");
    if (bodyNodes != null)
    {
        foreach (HtmlNode body in bodyNodes)
        {
            if (!string.IsNullOrWhiteSpace(body.InnerText))
            {
                // NOTE(review): this removes every occurrence of the literal below from
                // the text before whitespace is collapsed. If the literal is a plain
                // space, words separated only by spaces get merged; confirm whether
                // "&nbsp;" or a non-breaking space was intended.
                content += body.InnerText.Trim().Replace(" ", "");
            }
        }
    }

    content = Regex.Replace(content, @"\s+", " ");
    return Regex.Replace(content, "[^a-zA-Z0-9 ÆØÅ æøå -]", "");
}