Beispiel #1
0
        /// <summary>
        /// Crawls a single url. When the url's host is known, allowed and the url has
        /// not been visited, the page is downloaded, one row per distinct title word is
        /// inserted into the pages table, and every in-domain link found on the page is
        /// added to the url queue. When the host is not known yet but is inside the
        /// configured domains, the url is pushed to the robot queue and back onto the
        /// url queue.
        /// </summary>
        /// <param name="url">Absolute url to crawl.</param>
        /// <returns>
        /// Item1: number of page rows inserted by this call.
        /// Item2: change in url queue size; starts at -1 and gains 1 per enqueued url.
        /// </returns>
        /// <remarks>
        /// Exceptions are not propagated; they are recorded in the errors table on a
        /// best-effort basis and the counts gathered so far are returned.
        /// </remarks>
        public Tuple <int, int> crawlSite(string url)
        {
            int updateIndex = 0;
            // NOTE(review): presumably -1 accounts for the url the caller just dequeued - confirm.
            int updateQueue = -1;

            try
            {
                Uri  uri = new Uri(url);
                Host host;

                if (!hosts.TryGetValue(uri.Host, out host))
                {
                    //robots.txt has not been parsed for this host yet
                    if (Operation.domains.Values.Any(uri.Host.Contains))
                    {
                        //hand the url to the robot queue and put it back on the url queue
                        robotQueue.AddMessage(new CloudQueueMessage(uri.AbsoluteUri));
                        urlQueue.AddMessage(new CloudQueueMessage(uri.AbsoluteUri));
                        updateQueue++;
                    }
                }
                else if (host.isAllowed(uri) && !host.hasVisited(uri))
                {
                    HtmlWeb      web     = new HtmlWeb();
                    HtmlDocument htmlDoc = web.Load(uri.AbsoluteUri);

                    if (web.StatusCode == HttpStatusCode.OK)
                    {
                        host.addVisited(uri);
                        updateIndex += indexPage(url, uri, htmlDoc);
                        updateQueue += enqueueLinks(uri, htmlDoc);
                    }
                }
            }
            catch (Exception e)
            {
                logCrawlError(url, e);
            }

            return(new Tuple <int, int>(updateIndex, updateQueue));
        }

        // Returns the "content" attribute of the first node matched by the given XPath
        // expressions (tried in order) that carries a non-blank value, or null when no
        // expression yields one. A matched node with an empty/missing content attribute
        // does not stop the search.
        private static string firstMetaContent(HtmlDocument htmlDoc, params string[] xpaths)
        {
            foreach (string xpath in xpaths)
            {
                HtmlNode node = htmlDoc.DocumentNode.SelectSingleNode(xpath);
                if (node == null)
                {
                    continue;
                }

                string content = node.GetAttributeValue("content", "");
                if (!string.IsNullOrWhiteSpace(content))
                {
                    return content;
                }
            }
            return null;
        }

        // Extracts title, date and body from the document and inserts one page row per
        // distinct word of the title. Returns the number of rows inserted successfully.
        private int indexPage(string url, Uri uri, HtmlDocument htmlDoc)
        {
            //title: meta title, then Open Graph title, then the <title> element.
            //Open Graph tags normally use the "property" attribute; "name" is kept for pages that use it.
            string title = firstMetaContent(htmlDoc,
                                            "//meta[@name='title']",
                                            "//meta[@property='og:title' or @name='og:title']");
            if (title == null)
            {
                HtmlNode titleNode = htmlDoc.DocumentNode.SelectSingleNode("//title");
                title = (titleNode != null) ? HttpUtility.HtmlDecode(titleNode.InnerHtml) : "";
            }

            //date: last modification date, then publication date, then the current UTC time
            string date = firstMetaContent(htmlDoc,
                                           "//meta[@name='lastmod']",
                                           "//meta[@name='pubdate']")
                          ?? DateTime.UtcNow.ToString("s", System.Globalization.CultureInfo.InvariantCulture);

            //body: description, then Open Graph description, then the title
            string body = firstMetaContent(htmlDoc,
                                           "//meta[@name='description']",
                                           "//meta[@property='og:description' or @name='og:description']")
                          ?? title;

            //Insert page with each word in the title as a row key
            int      inserted = 0;
            string[] keyWords = Operation.stripPunct(title.ToLower()).Split().Distinct().ToArray();
            foreach (string key in keyWords)
            {
                if (key == "")
                {
                    continue;
                }

                try
                {
                    PageEntity page = new PageEntity(uri, title, date, body, key);
                    pagesTable.Execute(TableOperation.Insert(page));
                    inserted++;
                }
                catch (Exception e)
                {
                    //a failed row must not stop the remaining keywords
                    logCrawlError(url, e);
                    Console.Write(e.ToString());
                }
            }
            return inserted;
        }

        // Adds every link of the document that lies within the allowed domains to the
        // url queue. Returns the number of urls handed to the queue.
        private int enqueueLinks(Uri uri, HtmlDocument htmlDoc)
        {
            int                enqueued = 0;
            HtmlNodeCollection anchors  = htmlDoc.DocumentNode.SelectNodes("//a");
            if (anchors == null)
            {
                return enqueued;
            }

            foreach (HtmlNode link in anchors)
            {
                try
                {
                    string href = link.GetAttributeValue("href", null);
                    if (href == null)
                    {
                        //anchor without href (e.g. a named anchor): nothing to follow
                        continue;
                    }

                    Uri newUri;
                    if (!Uri.TryCreate(uri, href, out newUri))
                    {
                        Console.WriteLine("Invalid html url found: " + href);
                        continue;
                    }

                    //add url only if within allowed domain
                    if (!Operation.domains.Values.Any(newUri.Host.Contains))
                    {
                        continue;
                    }

                    //urls on the BR1/BR2 hosts are only followed below the configured path
                    bool isBrHost = newUri.Host.Contains(Operation.domains["BR1"]) ||
                                    newUri.Host.Contains(Operation.domains["BR2"]);
                    if (isBrHost && !newUri.AbsolutePath.StartsWith(Operation._BR_PATH))
                    {
                        continue;
                    }

                    // NOTE(review): the task is not awaited, so a failed enqueue is not
                    // observed here and is still counted.
                    urlQueue.AddMessageAsync(new CloudQueueMessage(newUri.AbsoluteUri));
                    enqueued++;
                }
                catch (Exception e)
                {
                    //one bad link must not stop the remaining links
                    Console.WriteLine("Invalid html url found: " + e.ToString());
                }
            }
            return enqueued;
        }

        // Best-effort insert of an error row for the given url. Failures while building
        // or submitting the row are written to the console instead of being thrown.
        private void logCrawlError(string url, Exception e)
        {
            try
            {
                // NOTE(review): timestamp is local time in the current culture's format,
                // unlike the invariant UTC page date - confirm before changing.
                ErrorEntity    err             = new ErrorEntity(url, e.Message, DateTime.Now.ToString());
                TableOperation insertOperation = TableOperation.Insert(err);
                // not awaited: only failures raised synchronously are caught below
                errorsTable.ExecuteAsync(insertOperation);
            }
            catch (Exception insErr)
            {
                Console.Write(insErr.ToString());
            }
        }
 /// <summary>
 /// Creates a pair from a page, a count and a set of query words.
 /// </summary>
 /// <param name="page">Value stored in the <c>page</c> member.</param>
 /// <param name="count">Value stored in the <c>count</c> member.</param>
 /// <param name="query">Set stored by reference (not copied) in the <c>queryWords</c> member.</param>
 public PagePair(PageEntity page, int count, HashSet <string> query)
 {
     // No parameter shadows queryWords, so no qualifier is needed here.
     queryWords = query;

     // These two parameters shadow the members they initialise.
     this.count = count;
     this.page  = page;
 }