/// <summary>
/// Crawls a single URL: downloads the page, stores one index row per distinct
/// word of its title, and queues every in-domain link found on the page.
/// If the URL's host has no entry in <c>hosts</c> yet (robots.txt not parsed),
/// the URL is sent to the robots queue and put back on the URL queue instead.
/// </summary>
/// <param name="url">Absolute URL to crawl.</param>
/// <returns>
/// Item1: number of page rows inserted into the pages table.
/// Item2: change in URL queue size. Starts at -1 (presumably accounting for the
/// message the caller already dequeued - confirm against the caller) and grows
/// by one for every URL queued here.
/// </returns>
public Tuple<int, int> crawlSite(string url)
{
    int updateIndex = 0;
    int updateQueue = -1;

    try
    {
        Uri uri = new Uri(url);
        Host host;

        if (!hosts.TryGetValue(uri.Host, out host))
        {
            // robots.txt has not been parsed for this host yet. If the host is
            // within an allowed domain, request the robots parse and re-queue
            // the url so it is retried afterwards.
            if (Operation.domains.Values.Any(uri.Host.Contains))
            {
                robotQueue.AddMessage(new CloudQueueMessage(uri.AbsoluteUri));
                urlQueue.AddMessage(new CloudQueueMessage(uri.AbsoluteUri));
                updateQueue++;
            }
            return new Tuple<int, int>(updateIndex, updateQueue);
        }

        // Nothing to do for disallowed urls or urls that were already crawled.
        if (!host.isAllowed(uri) || host.hasVisited(uri))
        {
            return new Tuple<int, int>(updateIndex, updateQueue);
        }

        HtmlWeb web = new HtmlWeb();
        HtmlDocument htmlDoc = web.Load(uri.AbsoluteUri);
        if (web.StatusCode != HttpStatusCode.OK)
        {
            return new Tuple<int, int>(updateIndex, updateQueue);
        }

        host.addVisited(uri);

        string title = readPageTitle(htmlDoc);
        string date = readPageDate(htmlDoc);
        // The title doubles as the body when the page has no description.
        string body = readPageDescription(htmlDoc) ?? title;

        updateIndex += indexPageByTitleWords(url, uri, title, date, body);
        updateQueue += enqueueCrawlableLinks(uri, htmlDoc);
    }
    catch (Exception e)
    {
        recordCrawlError(url, e);
    }

    return new Tuple<int, int>(updateIndex, updateQueue);
}

// Returns the "content" attribute of the first node matching xpath, or null when no node matches.
private static string readMetaContent(HtmlDocument htmlDoc, string xpath)
{
    HtmlNode node = htmlDoc.DocumentNode.SelectSingleNode(xpath);
    return node == null ? null : node.GetAttributeValue("content", "");
}

// Resolves the page title: meta title, then Open Graph title, then the <title> element, else "".
private static string readPageTitle(HtmlDocument htmlDoc)
{
    // Open Graph tags normally use property="og:..."; name= is kept for
    // pages that use the non-standard form.
    string metaTitle = readMetaContent(htmlDoc, "//meta[@name='title']")
        ?? readMetaContent(htmlDoc, "//meta[@name='og:title' or @property='og:title']");
    if (metaTitle != null)
    {
        return metaTitle;
    }

    HtmlNode titleNode = htmlDoc.DocumentNode.SelectSingleNode("//title");
    return titleNode == null ? "" : HttpUtility.HtmlDecode(titleNode.InnerHtml);
}

// Resolves the page date: lastmod meta, then pubdate meta, else the current UTC time in sortable format.
private static string readPageDate(HtmlDocument htmlDoc)
{
    return readMetaContent(htmlDoc, "//meta[@name='lastmod']")
        ?? readMetaContent(htmlDoc, "//meta[@name='pubdate']")
        ?? DateTime.UtcNow.ToString("s", System.Globalization.CultureInfo.InvariantCulture);
}

// Resolves the page description: description meta, then Open Graph description, else null.
private static string readPageDescription(HtmlDocument htmlDoc)
{
    return readMetaContent(htmlDoc, "//meta[@name='description']")
        ?? readMetaContent(htmlDoc, "//meta[@name='og:description' or @property='og:description']");
}

// Inserts one PageEntity per distinct non-empty title word and returns the number of rows inserted.
private int indexPageByTitleWords(string url, Uri uri, string title, string date, string body)
{
    int inserted = 0;

    // Invariant lower-casing keeps row keys independent of the server's culture.
    foreach (string key in Operation.stripPunct(title.ToLowerInvariant()).Split().Distinct())
    {
        if (key == "")
        {
            continue;
        }

        try
        {
            PageEntity page = new PageEntity(uri, title, date, body, key);
            pagesTable.Execute(TableOperation.Insert(page));
            inserted++;
        }
        catch (Exception e)
        {
            // A failed row must not stop the remaining keywords from being indexed.
            recordCrawlError(url, e);
            Console.Write(e.ToString());
        }
    }

    return inserted;
}

// Queues every crawlable link found in the document and returns the number of urls queued.
private int enqueueCrawlableLinks(Uri pageUri, HtmlDocument htmlDoc)
{
    HtmlNodeCollection anchors = htmlDoc.DocumentNode.SelectNodes("//a");
    if (anchors == null)
    {
        return 0;
    }

    int queued = 0;
    foreach (HtmlNode link in anchors)
    {
        try
        {
            // An anchor without an href would resolve to the page itself and
            // re-queue the url that was just crawled.
            string href = link.GetAttributeValue("href", null);
            if (string.IsNullOrWhiteSpace(href))
            {
                continue;
            }

            Uri newUri = new Uri(pageUri, href);
            if (!isCrawlableLink(newUri))
            {
                continue;
            }

            // NOTE(review): the returned task is not awaited, so a failed
            // enqueue is not observed here and is still counted.
            urlQueue.AddMessageAsync(new CloudQueueMessage(newUri.AbsoluteUri));
            queued++;
        }
        catch (Exception e)
        {
            //Invalid url
            Console.WriteLine("Invalid html url found: " + e.ToString());
        }
    }

    return queued;
}

// True when the link is http(s), inside an allowed domain, and - for the BR1/BR2 domains - under Operation._BR_PATH.
private bool isCrawlableLink(Uri link)
{
    // mailto:, javascript:, ftp: and similar links cannot be crawled.
    if (link.Scheme != Uri.UriSchemeHttp && link.Scheme != Uri.UriSchemeHttps)
    {
        return false;
    }

    string linkHost = link.Host;
    if (!Operation.domains.Values.Any(linkHost.Contains))
    {
        return false;
    }

    bool isRestrictedDomain = linkHost.Contains(Operation.domains["BR1"])
        || linkHost.Contains(Operation.domains["BR2"]);

    return !isRestrictedDomain || link.AbsolutePath.StartsWith(Operation._BR_PATH);
}

// Best-effort insert of an ErrorEntity for the url; a failure to log is printed and never rethrown.
private void recordCrawlError(string url, Exception error)
{
    try
    {
        ErrorEntity err = new ErrorEntity(url, error.Message, DateTime.Now.ToString());
        errorsTable.ExecuteAsync(TableOperation.Insert(err));
    }
    catch (Exception insErr)
    {
        Console.Write(insErr.ToString());
    }
}
/// <summary>
/// Creates a pair holding the given page, its count, and the set of query words.
/// </summary>
/// <param name="page">Page entity stored on this instance.</param>
/// <param name="count">Count stored alongside the page.</param>
/// <param name="query">Set of query words stored as <c>queryWords</c>.</param>
public PagePair(PageEntity page, int count, HashSet<string> query)
{
    this.page = page;
    this.count = count;
    queryWords = query;
}