예제 #1
0
        /// <summary>
        /// Ranks the pages that contain every query term present in the table built by
        /// <c>tfidfCalc</c>.
        /// </summary>
        /// <remarks>
        /// Query terms that have no entry in the table are ignored instead of emptying the
        /// result. A page's score is the sum of its table values over all matched terms, so
        /// the ranking no longer depends on which term the table happens to enumerate first.
        /// For a single matched term the result is the same as before.
        /// </remarks>
        /// <param name="words">Whitespace-separated query text; null or blank yields an empty list.</param>
        /// <param name="index">Index handed to <c>tfCalc</c> and <c>idfCalc</c>.</param>
        /// <returns>Page id / score pairs sorted by descending score; empty when nothing matches.</returns>
        public List <KeyValuePair <int, double> > GetPagesWithWords(string words, Dictionary <string, Dictionary <int, double> > index)
        {
            List <KeyValuePair <int, double> > pages = new List <KeyValuePair <int, double> >();

            // A null or blank query has no terms to look up.
            if (string.IsNullOrWhiteSpace(words))
            {
                return(pages);
            }

            // A set gives constant-time membership tests while filtering the table below.
            HashSet <string> terms = new HashSet <string>(
                i.RemoveStopWords(words.Split(new char[0], StringSplitOptions.RemoveEmptyEntries).ToList()));

            // Nothing left after stop-word removal: skip the table computation entirely.
            if (terms.Count == 0)
            {
                return(pages);
            }

            Dictionary <string, Dictionary <int, double> > tf    = tfCalc(index);
            Dictionary <string, double>                    idf   = idfCalc(index);
            Dictionary <string, Dictionary <int, double> > tfidf = tfidfCalc(tf, idf);

            // TODO: implement vector comparison (original author's note); for now the
            // per-term table values are compared directly.

            // Per-page scores of every query term that exists in the table.
            List <Dictionary <int, double> > matched = tfidf
                                                       .Where(entry => terms.Contains(entry.Key))
                                                       .Select(entry => entry.Value)
                                                       .ToList();

            if (matched.Count == 0)
            {
                return(pages);
            }

            // A page qualifies only if it appears under every matched term, so the pages
            // of the first term are a complete candidate set.
            foreach (KeyValuePair <int, double> candidate in matched[0])
            {
                double score        = 0;
                bool   inEveryTerm  = true;

                foreach (Dictionary <int, double> termPages in matched)
                {
                    double termScore;
                    if (!termPages.TryGetValue(candidate.Key, out termScore))
                    {
                        inEveryTerm = false;
                        break;
                    }

                    score += termScore;
                }

                if (inEveryTerm)
                {
                    pages.Add(new KeyValuePair <int, double>(candidate.Key, score));
                }
            }

            // Highest combined score first.
            pages.Sort((left, right) => right.Value.CompareTo(left.Value));
            return(pages);
        }
예제 #2
0
        /// <summary>
        /// Loads the page at <paramref name="url"/> and starts two background tasks: one
        /// collects the page's hyperlinks and passes them to <c>SortHyperLinks</c>, the other
        /// cleans the body text and passes it to <c>ContentHandler.AddContent</c>.
        /// </summary>
        /// <remarks>
        /// The tasks are not awaited: this method returns as soon as they are queued, and
        /// any exception thrown inside them is not observed here. A URL that is not an
        /// absolute URI, or a page that fails to load, is skipped silently.
        /// </remarks>
        /// <param name="url">Absolute address of the page to fetch.</param>
        private void FetchData(string url)
        {
            // The base URI is needed to resolve root-relative links and to obtain the host.
            Uri pageUri;
            if (!Uri.TryCreate(url, UriKind.Absolute, out pageUri))
            {
                return;
            }

            HtmlWeb      web = new HtmlWeb();
            HtmlDocument doc;

            try
            {
                doc = web.Load(url);
            }
            catch (Exception)
            {
                // Best effort: a page that cannot be loaded is simply skipped.
                return;
            }

            // Collecting the hyperlinks (fire-and-forget).
            Task.Run(() =>
            {
                List <string>      hyperlinks = new List <string>();
                HtmlNodeCollection hyperNodes = doc.DocumentNode.SelectNodes("//a[@href]");

                if (hyperNodes != null)
                {
                    foreach (HtmlNode link in hyperNodes)
                    {
                        // Read the attribute itself; splitting the outer HTML on quotes
                        // picks up whichever attribute happens to come first.
                        string href = link.GetAttributeValue("href", string.Empty).Trim();

                        if (href.StartsWith("/", StringComparison.Ordinal))
                        {
                            // Resolve against the page address so the result is correct
                            // even when url carries a path or lacks a trailing slash.
                            Uri resolved;
                            if (!Uri.TryCreate(pageUri, href, out resolved))
                            {
                                continue;
                            }

                            href = resolved.AbsoluteUri;
                        }

                        if (href.StartsWith("http", StringComparison.Ordinal))
                        {
                            hyperlinks.Add(href);
                        }
                    }
                }

                SortHyperLinks(pageUri.Host, hyperlinks);
            });

            // Preprocessing the content (fire-and-forget).
            Task.Run(() =>
            {
                string             content      = string.Empty;
                HtmlNodeCollection contentNodes = doc.DocumentNode.SelectNodes("//body");

                if (contentNodes != null)
                {
                    foreach (HtmlNode text in contentNodes)
                    {
                        if (!string.IsNullOrWhiteSpace(text.InnerText))
                        {
                            // A non-breaking space separates words, so turn it into a
                            // plain space instead of deleting it.
                            content += text.InnerText.Replace("&nbsp;", " ").Replace("&nbsp", " ").Trim();
                        }
                    }
                }

                // Collapse whitespace runs, then drop everything except letters, digits,
                // spaces and hyphens.
                content   = Regex.Replace(content, @"\s+", " ");
                Regex rgx = new Regex("[^a-zA-Z0-9 ÆØÅ æøå -]");
                content   = rgx.Replace(content, "");

                // Invariant lower-casing keeps the stored text independent of the
                // machine's current culture.
                content = Indexer.RemoveStopWords(content.ToLowerInvariant());
                ContentHandler.AddContent(content, url);
            });
        }