Exemplo n.º 1
0
        //add to the UrlTable
        public void addToTable(string url, string title, string date)
        {
            WebPage page = new ClassLibrary1.WebPage(url, title, date);

            //add page to table
            TableOperation insertOrReplaceOperation = TableOperation.InsertOrReplace(page);

            StorageManager.getTable().Execute(insertOrReplaceOperation);

            SizeCounter.SizeIndex += 1;

            CloudQueueMessage message2 = new CloudQueueMessage(SizeCounter.SizeIndex.ToString());

            StorageManager.getNumIndex().AddMessage(message2);

            Performance.insertPerformance("Crawling");
        }
Exemplo n.º 2
0
        //parse through HTML urls, index the information, add them to the Url table. Follow each link accompanied by it, check its validity and visit status, then add back to queue.
        public void parseHTML(string url)
        {
            Performance.insertPerformance("Crawling");

            CloudQueueMessage message = new CloudQueueMessage(SizeCounter.SizeIndex.ToString());

            StorageManager.getNumIndex().AddMessage(message);

            if (crawlable)
            {
                SizeCounter.NumCrawled += 1;

                CloudQueueMessage message2 = new CloudQueueMessage(SizeCounter.NumCrawled.ToString());

                StorageManager.getNumCrawled().AddMessage(message2);

                Performance.insertPerformance("Crawling");
                try
                {
                    //index information
                    var    Url   = url;
                    var    web   = new HtmlWeb();
                    var    doc   = web.Load(url);
                    var    title = doc.DocumentNode.SelectSingleNode("//head/title").InnerHtml;
                    var    meta  = doc.DocumentNode.SelectNodes("//meta");
                    string date  = "no date found";
                    foreach (HtmlNode tag in meta)
                    {
                        string property = tag.GetAttributeValue("property", "");
                        if (property.Contains("published_time") || property.Contains("pubdate"))
                        {
                            date = tag.GetAttributeValue("content", "");
                        }
                    }

                    //add to table
                    addToTable(Url, title, date);

                    //check header links that relate to the link
                    var linksList = doc.DocumentNode.SelectNodes("//head/link");
                    if (linksList != null)
                    {
                        foreach (HtmlNode link in linksList)
                        {
                            string href = link.GetAttributeValue("href", "");
                            if ((href.Contains("cnn.com") || (href.Contains("bleacherreport.com") && href.Contains("nba"))) && (href.Contains("html") || href.Contains("htm")))
                            {
                                Debug.WriteLine("new link: ");
                                Debug.WriteLine(href);
                                if (!checkDisallow(href, this.Disallow))
                                {
                                    if (!this.Visited.Contains(href))
                                    {
                                        SizeCounter.SizeQueue++;
                                        addToUrlQueue(href);

                                        CloudQueueMessage message3 = new CloudQueueMessage(SizeCounter.SizeQueue.ToString());

                                        StorageManager.getNumQueue().AddMessage(message3);

                                        Performance.insertPerformance("Crawling");
                                    }
                                }
                            }
                        }
                    }

                    //check body links that relate to the link
                    var aList = doc.DocumentNode.SelectNodes("//a[@href]");
                    if (aList != null)
                    {
                        foreach (HtmlNode a in aList)
                        {
                            string href = a.GetAttributeValue("href", "");
                            if ((href.Contains("cnn.com") || (href.Contains("bleacherreport.com") && href.Contains("nba"))) && (href.Contains("html") || href.Contains("htm")))
                            {
                                Debug.WriteLine("new link: ");
                                Debug.WriteLine(href);
                                if (!checkDisallow(href, this.Disallow))
                                {
                                    if (!this.Visited.Contains(href))
                                    {
                                        SizeCounter.SizeQueue++;
                                        addToUrlQueue(href);
                                        CloudQueueMessage message4 = new CloudQueueMessage(SizeCounter.SizeQueue.ToString());

                                        StorageManager.getNumQueue().AddMessage(message4);

                                        Performance.insertPerformance("Crawling");
                                    }
                                }
                            }
                        }
                    }
                }
                catch (Exception e)
                {
                    //put in exception table with URL
                    ExceptionUrl except = new ClassLibrary1.ExceptionUrl(e.ToString(), url);

                    //add exception to table
                    TableOperation insertOrReplaceOperation = TableOperation.InsertOrReplace(except);
                    StorageManager.getExceptionTable().Execute(insertOrReplaceOperation);
                }
            }
            else
            {
                return;
            }
        }