Beispiel #1
0
        /// <summary>
        /// Downloads the page at <paramref name="url"/> and turns it into a
        /// <see cref="WebPageDocument"/>: the links come from <c>Detector.linkFinder</c>,
        /// the remaining metadata from <c>Detector.setWebPageInformation</c>, and
        /// <c>Url</c> is set to the requested address.
        /// </summary>
        /// <param name="url">Absolute address of the page to fetch.</param>
        /// <returns>
        /// The populated document, or <c>null</c> when the request cannot be created,
        /// the download fails, or parsing throws. Callers rely on <c>null</c> as the
        /// failure signal, so no exception escapes this method.
        /// </returns>
        public static WebPageDocument crawl(string url)
        {
            try
            {
                // Created inside the try block so a malformed or non-HTTP URL yields
                // null like every other failure instead of crashing the caller.
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                request.UserAgent = "A .NET Web Crawler";

                string rawHtmlString;

                // Dispose response, stream and reader deterministically; an undisposed
                // response keeps its connection checked out of the connection pool.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream stream = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(stream))
                {
                    rawHtmlString = reader.ReadToEnd();
                }

                WebPageDocument webPage = new WebPageDocument();
                webPage.Links = Detector.linkFinder(rawHtmlString);
                Detector.setWebPageInformation(rawHtmlString, webPage);
                webPage.Url = url;
                return webPage;
            }
            catch (Exception ex)
            {
                // Best-effort crawl: keep the "null on failure" contract, but leave a
                // trace so failures can be diagnosed instead of vanishing silently.
                System.Diagnostics.Trace.TraceWarning("crawl failed for {0}: {1}", url, ex.Message);
                return null;
            }
        }
Beispiel #2
0
        /// <summary>
        /// Extracts page metadata from raw HTML with regular expressions and stores it on
        /// <paramref name="webPage"/>: the description and keywords meta tags, the title,
        /// and features built from the text of all headings and all paragraphs.
        /// </summary>
        /// <param name="rawHtmlString">The complete HTML source of the page.</param>
        /// <param name="webPage">
        /// Document to populate. <c>Description</c>, <c>Keywords</c> and <c>Title</c> are
        /// only assigned when the corresponding tag is found; <c>Headings</c> and
        /// <c>Paragraphs</c> are always assigned (an empty list when nothing matches).
        /// </param>
        public static void setWebPageInformation(string rawHtmlString, WebPageDocument webPage)
        {
            Match descriptionMatch = Regex.Match(rawHtmlString, @"<meta name=\""description\"" content=\""(.*?)\""", RegexOptions.Singleline);
            if (descriptionMatch.Success)
            {
                webPage.Description = descriptionMatch.Groups[1].Value;
            }

            Match keywordsMatch = Regex.Match(rawHtmlString, @"<meta name=\""keywords\"" content=\""(.*?)\""", RegexOptions.Singleline);
            if (keywordsMatch.Success)
            {
                webPage.Keywords = webPage.buildFeatures(Tokenizer.tokenize(keywordsMatch.Groups[1].Value));
            }

            Match titleMatch = Regex.Match(rawHtmlString, @"<title>(.*?)</title>", RegexOptions.Singleline);
            if (titleMatch.Success)
            {
                webPage.Title = titleMatch.Groups[1].Value;
            }

            // The closing tag must match the opening level (\1). The previous pattern
            // opened on any heading level but only ever closed on the level-1 tag.
            // Singleline lets a heading's content span line breaks.
            MatchCollection headingMatches = Regex.Matches(rawHtmlString, @"<h([1-6])\b[^>]*>(.*?)</h\1>", RegexOptions.Singleline);
            List <string>   headingTokens  = collectTextTokens(headingMatches, 2);

            // Features are built once from the tokens of every match. Assigning inside
            // the loop, as before, kept only the last heading on the page.
            // NOTE(review): assumes buildFeatures is a plain tokens-to-features conversion — confirm.
            webPage.Headings = headingTokens.Count > 0
                ? webPage.buildFeatures(headingTokens)
                : new List <Feature>();

            // \b keeps tags that merely start with "p" (pre, param, ...) from matching.
            // Singleline replaces Multiline, which only changes ^ and $ and therefore had
            // no effect here; paragraphs containing line breaks were silently skipped.
            MatchCollection paragraphMatches = Regex.Matches(rawHtmlString, @"<p\b[^>]*>(.*?)</p>", RegexOptions.Singleline);
            List <string>   paragraphTokens  = collectTextTokens(paragraphMatches, 1);

            webPage.Paragraphs = paragraphTokens.Count > 0
                ? webPage.buildFeatures(paragraphTokens)
                : new List <Feature>();
        }

        /// <summary>
        /// Strips nested tags from the given capture group of every match and returns the
        /// tokens of all matches concatenated in document order.
        /// </summary>
        /// <param name="matches">Regex matches whose captured content should be tokenized.</param>
        /// <param name="contentGroup">Index of the capture group holding the element content.</param>
        /// <returns>All tokens of all matches; empty when there are no matches.</returns>
        private static List <string> collectTextTokens(MatchCollection matches, int contentGroup)
        {
            List <string> allTokens = new List <string>();

            foreach (Match m in matches)
            {
                string        pureText = Regex.Replace(m.Groups[contentGroup].Value, "<.*?>", String.Empty);
                List <string> tokens   = Tokenizer.tokenize(pureText);
                if (tokens != null)
                {
                    allTokens.AddRange(tokens);
                }
            }
            return allTokens;
        }
Beispiel #3
0
        /// <summary>
        /// Console entry point. Asks how many pages to crawl, records that number in
        /// pages.txt, crawls links under http://espn.go.com/nba starting from the seed
        /// page, then stores the inverted index and the document records in the LiteDB
        /// database file.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            const string seedUrl = "http://espn.go.com/nba";

            Console.WriteLine("Alireza Khanshan - A Web Crawler");
            Console.Write("How many pages? ");

            // TryParse instead of Convert.ToInt32: a typo should not end in an
            // unhandled FormatException.
            int allPages;
            if (!int.TryParse(Console.ReadLine(), out allPages))
            {
                Console.WriteLine("Please enter a whole number of pages.");
                return;
            }

            using (StreamWriter writer =
                       new StreamWriter("C:/Crawler/Data/pages.txt"))
            {
                writer.Write(allPages);
            }
            Console.WriteLine("crawling...");

            // crawl() returns null on failure; fall back to an empty frontier rather
            // than dereferencing null when the seed page is unreachable.
            WebPageDocument seedPage = Crawler.crawl(seedUrl);
            List <Link>     frontier;
            if (seedPage != null && seedPage.Links != null)
            {
                frontier = seedPage.Links;
            }
            else
            {
                Console.WriteLine("Could not crawl " + seedUrl);
                frontier = new List <Link>();
            }

            int                    count            = 1;
            HashSet <string>       blackList        = new HashSet <string>(); // hrefs already attempted
            List <Document>        documents        = new List <Document>();
            List <WebPageDocument> webPageDocuments = new List <WebPageDocument>();

            Console.WriteLine(count);
            Console.WriteLine(seedUrl + " : " + frontier.Count);
            count++;

            while (frontier.Count > 0 && count < allPages)
            {
                // Drop every link that was already attempted (single pass, O(1) lookups).
                frontier.RemoveAll(item => item.Href != null && blackList.Contains(item.Href));
                if (frontier.Count == 0)
                {
                    // Pruning can empty the frontier; reading frontier[0] would throw.
                    break;
                }

                string href      = frontier[0].Href;
                string targetUrl = resolveCrawlUrl(href);
                if (targetUrl == null)
                {
                    // Missing href, mailto link, or a link outside the crawled section.
                    frontier.RemoveAt(0);
                    continue;
                }

                // Mark as attempted before crawling so a failing page is not retried.
                frontier.RemoveAll(item => item.Href == href);
                blackList.Add(href);

                WebPageDocument webPage  = Crawler.crawl(targetUrl);
                List <Link>     newLinks = webPage != null ? webPage.Links : null;
                if (newLinks == null)
                {
                    continue;
                }

                Console.Write(count + ". ");
                Console.WriteLine(webPage.Url);
                documents.Add(new Document()
                {
                    Id          = count,
                    Description = webPage.Description,
                    Links       = newLinks.Count,
                    Title       = webPage.Title,
                    Url         = webPage.Url
                });
                webPage.Id = count;
                webPageDocuments.Add(webPage);
                count++;
                frontier.AddRange(newLinks);
            }

            List <InvertedListItem> list = IndexBuilder.build(webPageDocuments);

            using (var db = new LiteDatabase(@"C:/Crawler/Data/crawler.db"))
            {
                var invertedIndex = db.GetCollection <InvertedListItem>("inverted_index");
                foreach (InvertedListItem i in list)
                {
                    invertedIndex.Insert(i);
                }
                var documentDataStore = db.GetCollection <Document>("document_store");
                foreach (Document d in documents)
                {
                    documentDataStore.Insert(d);
                }
            }
            Console.WriteLine("*** done ***");
            Console.ReadKey();
        }

        /// <summary>
        /// Maps a link's href to the absolute URL that should be crawled.
        /// </summary>
        /// <param name="href">Raw href taken from a page; may be null.</param>
        /// <returns>
        /// The href itself when it already starts with http://espn.go.com/nba, the href
        /// prefixed with http://espn.go.com when it starts with /nba/, and <c>null</c>
        /// when the href is missing, contains "mailto", or matches neither prefix.
        /// </returns>
        private static string resolveCrawlUrl(string href)
        {
            if (href == null)
            {
                return null;
            }
            // Ordinal, case-insensitive comparisons: ToLower() depends on the current
            // culture and can mis-handle the letter "i" (e.g. under a Turkish locale).
            if (href.IndexOf("mailto", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                return null;
            }
            if (href.StartsWith("http://espn.go.com/nba", StringComparison.OrdinalIgnoreCase))
            {
                return href;
            }
            if (href.StartsWith("/nba/", StringComparison.OrdinalIgnoreCase))
            {
                return "http://espn.go.com" + href;
            }
            return null;
        }