public static WebPageDocument crawl(string url)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.UserAgent = "A .NET Web Crawler";
    try
    {
        WebPageDocument webPage = new WebPageDocument();
        // Dispose the response, stream, and reader once the page has been read.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(stream))
        {
            string rawHtmlString = reader.ReadToEnd();
            webPage.Links = Detector.linkFinder(rawHtmlString);
            Detector.setWebPageInformation(rawHtmlString, webPage);
            webPage.Url = url;
            return webPage;
        }
    }
    catch
    {
        // Any network or parsing failure: signal the caller to skip this page.
        return null;
    }
}
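A minimal usage sketch, assuming only the Crawler, WebPageDocument, and Link types above; the seed URL is just an example:

WebPageDocument page = Crawler.crawl("http://espn.go.com/nba");
if (page != null)
{
    Console.WriteLine(page.Title);
    foreach (Link link in page.Links)
    {
        Console.WriteLine(link.Href);   // raw href exactly as found in the HTML
    }
}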
public static void setWebPageInformation(string rawHtmlString, WebPageDocument webPage)
{
    Match descriptionMatch = Regex.Match(rawHtmlString,
        @"<meta name=""description"" content=""(.*?)""", RegexOptions.Singleline);
    if (descriptionMatch.Success)
    {
        webPage.Description = descriptionMatch.Groups[1].Value;
    }

    Match keywordsMatch = Regex.Match(rawHtmlString,
        @"<meta name=""keywords"" content=""(.*?)""", RegexOptions.Singleline);
    if (keywordsMatch.Success)
    {
        webPage.Keywords = webPage.buildFeatures(Tokenizer.tokenize(keywordsMatch.Groups[1].Value));
    }

    Match titleMatch = Regex.Match(rawHtmlString, @"<title>(.*?)</title>", RegexOptions.Singleline);
    if (titleMatch.Success)
    {
        webPage.Title = titleMatch.Groups[1].Value;
    }

    // Match <h1>..</h1> through <h6>..</h6>; the back-reference keeps the closing tag level
    // in sync with the opening one instead of always expecting </h1>.
    MatchCollection headingMatches = Regex.Matches(rawHtmlString, @"<h(\d).*?>(.*?)</h\1>", RegexOptions.Singleline);
    webPage.Headings = new List<Feature>();
    foreach (Match m in headingMatches)
    {
        string pureText = Regex.Replace(m.Groups[2].Value, "<.*?>", String.Empty);
        List<string> tokens = Tokenizer.tokenize(pureText);
        // Accumulate features instead of overwriting the list on every match.
        webPage.Headings.AddRange(webPage.buildFeatures(tokens));
    }

    MatchCollection paragraphMatches = Regex.Matches(rawHtmlString, @"<p.*?>(.*?)</p>", RegexOptions.Singleline);
    webPage.Paragraphs = new List<Feature>();
    foreach (Match m in paragraphMatches)
    {
        string pureText = Regex.Replace(m.Groups[1].Value, "<.*?>", String.Empty);
        List<string> tokens = Tokenizer.tokenize(pureText);
        webPage.Paragraphs.AddRange(webPage.buildFeatures(tokens));
    }
}
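For illustration, a standalone sketch of how the back-referenced heading pattern behaves; it only needs System.Text.RegularExpressions, and the sample HTML string is made up:

string html = "<h1 class=\"top\">NBA <b>Scores</b></h1><p>Game recap...</p>";
foreach (Match m in Regex.Matches(html, @"<h(\d).*?>(.*?)</h\1>", RegexOptions.Singleline))
{
    // Strip nested tags before tokenizing, the same way setWebPageInformation does.
    string pureText = Regex.Replace(m.Groups[2].Value, "<.*?>", String.Empty);
    Console.WriteLine(pureText);   // prints "NBA Scores"
}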
static void Main(string[] args)
{
    Console.WriteLine("Alireza Khanshan - A Web Crawler");
    Console.Write("How many pages? ");
    int ALL_PAGES = Convert.ToInt32(Console.ReadLine());
    using (StreamWriter writer = new StreamWriter("C:/Crawler/Data/pages.txt"))
    {
        writer.Write(ALL_PAGES);
    }
    Console.WriteLine("crawling...");

    int count = 1;
    List<Link> frontier = Crawler.crawl("http://espn.go.com/nba").Links;
    List<string> blackList = new List<string>();
    List<Document> documents = new List<Document>();
    List<WebPageDocument> webPageDocuments = new List<WebPageDocument>();
    Console.WriteLine(count);
    Console.WriteLine("http://espn.go.com/nba : " + frontier.Count);
    count++;

    while (frontier.Count > 0 && count < ALL_PAGES)
    {
        // Drop every frontier entry that has already been crawled.
        foreach (string href in blackList)
        {
            frontier.RemoveAll(item => item.Href != null && item.Href.Equals(href));
        }

        Link crawlMe = frontier[0];
        if (crawlMe.Href == null || crawlMe.Href.ToLower().Contains("mailto"))
        {
            frontier.Remove(crawlMe);
            continue;
        }

        // Only follow links that stay inside the NBA section, whether absolute or relative.
        string absoluteUrl;
        if (crawlMe.Href.ToLower().StartsWith("http://espn.go.com/nba"))
        {
            absoluteUrl = crawlMe.Href;
        }
        else if (crawlMe.Href.ToLower().StartsWith("/nba/"))
        {
            absoluteUrl = "http://espn.go.com" + crawlMe.Href;
        }
        else
        {
            frontier.Remove(crawlMe);
            continue;
        }

        frontier.RemoveAll(item => item.Href == crawlMe.Href);
        blackList.Add(crawlMe.Href);

        WebPageDocument webPage = Crawler.crawl(absoluteUrl);
        List<Link> newLinks = webPage != null ? webPage.Links : null;
        if (newLinks != null)
        {
            Console.Write(count + ". ");
            Console.WriteLine(webPage.Url);
            documents.Add(new Document()
            {
                Id = count,
                Description = webPage.Description,
                Links = newLinks.Count,
                Title = webPage.Title,
                Url = webPage.Url
            });
            webPage.Id = count;
            webPageDocuments.Add(webPage);
            count++;
            foreach (Link l in newLinks)
            {
                frontier.Add(l);
            }
        }
    }

    // Build the inverted index and persist it, along with the document metadata, to LiteDB.
    List<InvertedListItem> list = IndexBuilder.build(webPageDocuments);
    using (var db = new LiteDatabase(@"C:/Crawler/Data/crawler.db"))
    {
        var invertedIndex = db.GetCollection<InvertedListItem>("inverted_index");
        foreach (InvertedListItem i in list)
        {
            invertedIndex.Insert(i);
        }
        var documentDataStore = db.GetCollection<Document>("document_store");
        foreach (Document d in documents)
        {
            documentDataStore.Insert(d);
        }
    }
    Console.WriteLine("*** done ***");
    Console.ReadKey();
}
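Once Main finishes, the LiteDB file can be reopened to inspect what was stored. A minimal sketch of reading the document_store collection back, using only the Document fields shown above (the fields of InvertedListItem are project-specific, so the inverted index is left out here):

using (var db = new LiteDatabase(@"C:/Crawler/Data/crawler.db"))
{
    var documentDataStore = db.GetCollection<Document>("document_store");
    foreach (Document d in documentDataStore.FindAll())
    {
        Console.WriteLine(d.Id + " | " + d.Title + " | " + d.Links + " links | " + d.Url);
    }
}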