Ejemplo n.º 1
0
        /// <summary>
        /// Wraps <paramref name="url"/> in a new <c>RecentUrlEntity</c> and inserts it into
        /// the table returned by <c>StorageManager.RecentTenTable()</c>.
        /// </summary>
        /// <param name="url">The URL to record.</param>
        private void UpdateRecentUrl(string url)
        {
            // Build the operation first, then resolve the table and run it
            // (same evaluation order as a single chained statement pair).
            var recentEntity = new RecentUrlEntity(url);
            TableOperation insertRecent = TableOperation.Insert(recentEntity);

            var recentTable = StorageManager.RecentTenTable();
            recentTable.Execute(insertRecent);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Takes one message from the HTML queue, downloads the page it names, stores one
        /// <c>PageEntity</c> per word returned by <c>CleanUpTitle</c>, and queues every link
        /// on the page that is not yet in <c>HtmlSet</c>.
        /// </summary>
        /// <remarks>
        /// Any exception raised while downloading or processing the page is written to the
        /// error table (at most once per URL, tracked through <c>HtmlSet</c>). The queue
        /// message is deleted afterwards whether processing succeeded, failed, or was
        /// skipped because the URL is disallowed. When the queue yields no message the
        /// method returns without doing anything else.
        /// </remarks>
        public void GetHTMLData()
        {
            new Task(GetPerfCounters).Start();
            Thread.Sleep(100);

            CloudQueueMessage htmllink = StorageManager.HTMLQueue().GetMessage();
            if (htmllink == null)
            {
                // No message available: bail out before touching htmllink.AsString or
                // calling DeleteMessage, both of which would fail on a null message.
                return;
            }

            string url = htmllink.AsString;

            if (!IsDisallow(url))
            {
                HtmlDocument webpage = new HtmlDocument();

                using (var client = new WebClient())
                {
                    try
                    {
                        webpage.LoadHtml(client.DownloadString(url));
                        Uri currentUri = new Uri(url);

                        // Publication date comes from <meta name="lastmod">; pages without
                        // that tag are stamped with today's date.
                        DateTime pubdlication;
                        HtmlNode pubdateHTML = webpage.DocumentNode.SelectSingleNode("//head/meta[@name='lastmod']");
                        if (pubdateHTML != null)
                        {
                            pubdlication = DateTime.Parse(pubdateHTML.Attributes["content"].Value);
                        }
                        else
                        {
                            pubdlication = DateTime.Today;
                        }

                        UpdateRecentUrl(url);

                        // A page without <title> is indexed with an empty title instead of
                        // throwing on the missing node.
                        HtmlNode titleNode = webpage.DocumentNode.SelectSingleNode("//head/title");
                        string   title     = titleNode != null ? (titleNode.InnerText ?? "") : "";

                        foreach (string word in CleanUpTitle(title))
                        {
                            TableOperation insertOp = TableOperation.InsertOrReplace(new PageEntity(word, title, url, pubdlication));
                            StorageManager.GetTable().Execute(insertOp);
                        }

                        HtmlNodeCollection href = webpage.DocumentNode.SelectNodes("//a[@href]");
                        if (href != null)
                        {
                            foreach (HtmlNode linkNode in href)
                            {
                                string templink = linkNode.Attributes["href"].Value;

                                // Expand protocol-relative ("//host/path") and
                                // root-relative ("/path") links to absolute http URLs.
                                if (templink.StartsWith("//"))
                                {
                                    templink = "http:" + templink;
                                }
                                else if (templink.StartsWith("/"))
                                {
                                    templink = "http://" + currentUri.Host + templink;
                                }

                                if (!HtmlSet.Contains(templink))
                                {
                                    HtmlSet.Add(templink);
                                    StorageManager.HTMLQueue().AddMessageAsync(new CloudQueueMessage(templink));
                                }
                            }
                        }
                    }
                    catch (Exception e)
                    {
                        // Record the failure once per URL.
                        if (!HtmlSet.Contains(url))
                        {
                            HtmlSet.Add(url);
                            TableOperation insertOp = TableOperation.InsertOrReplace(new ErrorMessage(url, e.Message));
                            StorageManager.ErrorTable().Execute(insertOp);
                        }
                    }
                }
            }

            StorageManager.HTMLQueue().DeleteMessage(htmllink);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Reads the XML document at <paramref name="url"/> and routes the text of every
        /// <c>loc</c> element: values that do not contain "xml" go to the URL queue (when
        /// not disallowed and not yet visited); values that contain "xml" and either "2018"
        /// or "nba.xml" are remembered in <c>XMLurls</c> and put back on the XML queue.
        /// </summary>
        /// <param name="url">Location of the XML document to read.</param>
        public void readXMLUrl(string url)
        {
            // NOTE(review): DTD processing is enabled for a document fetched from `url`.
            // If that document is not trusted, this permits DTD/entity expansion - confirm
            // it is really required.
            settings.DtdProcessing = DtdProcessing.Parse;

            // Dispose the reader (and the stream it opened) even if a queue or table call throws.
            using (XmlReader reader = XmlReader.Create(url, settings))
            {
                reader.MoveToContent();
                while (reader.Read())
                {
                    //add performance with no changes in queue size, index size, or number crawled
                    var crawled   = 0;
                    var sizeQueue = 0;
                    var sizeIndex = 0;
                    TableQuery <Performance> query3 = new TableQuery <Performance>()
                                                      .Take(1);

                    foreach (Performance item in StorageManager.getPerformanceTable().ExecuteQuery(query3))
                    {
                        crawled   = item.NumCrawled;
                        sizeQueue = item.SizeQueue;
                        sizeIndex = item.SizeIndex;
                    }

                    Performance.insertPerformance("Loading", crawled, sizeQueue, sizeIndex);

                    if (reader.NodeType != XmlNodeType.Element || reader.Name != "loc")
                    {
                        continue;
                    }

                    // Step from the <loc> start tag onto its content.
                    reader.Read();
                    if (reader.NodeType != XmlNodeType.Text)
                    {
                        continue;
                    }

                    string location = reader.Value;

                    if (!location.Contains("xml"))
                    {
                        //add html urls to list to be put into HTML queue
                        if (!checkDisallow(location, this.Disallow) && !this.Visited.Contains(location))
                        {
                            addToUrlQueue(location);
                        }
                    }
                    else if (location.Contains("2018") || location.Contains("nba.xml"))
                    {
                        // This branch must be a sibling of the "not xml" test above: nested
                        // inside it, a value containing "xml" could never reach it.

                        //add xml urls to list to be put back into XML queue
                        this.XMLurls.Add(location);

                        CloudQueueMessage newXMLMessage = new CloudQueueMessage(location);
                        StorageManager.getXMLQueue().AddMessage(newXMLMessage);
                    }
                }
            }
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Loads the page at <paramref name="url"/>, indexes its title and publication date
        /// through <c>addToTable</c>, then queues every matching link found in the page's
        /// <c>&lt;head&gt;</c> <c>link</c> tags and <c>a[href]</c> anchors.
        /// </summary>
        /// <remarks>
        /// Does nothing when <c>crawlable</c> is false. Any exception raised while loading or
        /// processing the page is stored, with the URL, in the exception table.
        /// </remarks>
        /// <param name="url">Address of the HTML page to crawl.</param>
        public void parseHTML(string url)
        {
            if (!crawlable)
            {
                return;
            }

            //update status, increase number crawled
            var crawled   = 0;
            var sizeQueue = 0;
            var sizeIndex = 0;
            TableQuery <Performance> query3 = new TableQuery <Performance>()
                                              .Take(1);

            foreach (Performance item in StorageManager.getPerformanceTable().ExecuteQuery(query3))
            {
                crawled   = item.NumCrawled + 1;
                sizeQueue = item.SizeQueue;
                sizeIndex = item.SizeIndex;
            }

            Performance.insertPerformance("Crawling", crawled, sizeQueue, sizeIndex);
            try
            {
                //index information
                var web   = new HtmlWeb();
                var doc   = web.Load(url);
                // NOTE(review): a page without <head><title> still throws here and ends up
                // in the exception table - confirm that is the intended handling.
                var title = doc.DocumentNode.SelectSingleNode("//head/title").InnerHtml;

                string date = "no date found";
                var    meta = doc.DocumentNode.SelectNodes("//meta");
                // Guard against a page with no <meta> tags, like the link lists below,
                // so such a page is still indexed (with the default date).
                if (meta != null)
                {
                    foreach (HtmlNode tag in meta)
                    {
                        string property = tag.GetAttributeValue("property", "");
                        if (property.Contains("published_time") || property.Contains("pubdate"))
                        {
                            date = tag.GetAttributeValue("content", "");
                        }
                    }
                }

                //add to table
                addToTable(url, title, date);

                // Header links first, then body anchors; both sets get the same filtering.
                foreach (string xpath in new[] { "//head/link", "//a[@href]" })
                {
                    var nodes = doc.DocumentNode.SelectNodes(xpath);
                    if (nodes == null)
                    {
                        continue;
                    }

                    foreach (HtmlNode node in nodes)
                    {
                        string href = node.GetAttributeValue("href", "");
                        bool   wanted = href.Contains("cnn.com")
                                        || href.Contains("bleacherreport.com/articles")
                                        || href.Contains("bleacherreport.com/nba");

                        if (wanted && !checkDisallow(href, this.Disallow) && !this.Visited.Contains(href))
                        {
                            addToUrlQueue(href);
                        }
                    }
                }
            }
            catch (Exception e)
            {
                //put in exception table with URL
                ExceptionUrl except = new ClassLibrary1.ExceptionUrl(e.ToString(), url);

                //add exception to table
                TableOperation insertOrReplaceOperation = TableOperation.InsertOrReplace(except);
                StorageManager.getExceptionTable().Execute(insertOrReplaceOperation);
            }
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Reads the XML document at <paramref name="url"/> and routes the text of every
        /// <c>loc</c> element: values containing "htm" go to the URL queue (when not
        /// disallowed and not yet visited); other values containing "xml" together with
        /// "2018", "2017/12" or "2017-12" are remembered in <c>XMLurls</c> and put back on
        /// the XML queue.
        /// </summary>
        /// <param name="url">Location of the XML document to read.</param>
        public void readXMLUrl(string url)
        {
            // NOTE(review): DTD processing is enabled for a document fetched from `url`.
            // If that document is not trusted, this permits DTD/entity expansion - confirm
            // it is really required.
            settings.DtdProcessing = DtdProcessing.Parse;

            // Dispose the reader (and the stream it opened) even if a queue or table call throws.
            using (XmlReader reader = XmlReader.Create(url, settings))
            {
                reader.MoveToContent();
                while (reader.Read())
                {
                    Performance.insertPerformance("Loading");

                    if (reader.NodeType != XmlNodeType.Element || reader.Name != "loc")
                    {
                        continue;
                    }

                    // Step from the <loc> start tag onto its content.
                    reader.Read();
                    if (reader.NodeType != XmlNodeType.Text)
                    {
                        continue;
                    }

                    string location = reader.Value;

                    // "htm" also matches every value that contains "html".
                    if (location.Contains("htm"))
                    {
                        //add html urls to list to be put into HTML queue
                        if (!checkDisallow(location, this.Disallow) && !this.Visited.Contains(location))
                        {
                            SizeCounter.SizeQueue++;

                            addToUrlQueue(location);

                            // Publish the updated queue size on the counter queue.
                            CloudQueueMessage message = new CloudQueueMessage(SizeCounter.SizeQueue.ToString());
                            StorageManager.getNumQueue().AddMessage(message);

                            Performance.insertPerformance("Crawling");
                        }
                    }
                    else if (location.Contains("xml") && (location.Contains("2018") || location.Contains("2017/12") || location.Contains("2017-12")))
                    {
                        //add xml urls to list to be put back into XML queue
                        this.XMLurls.Add(location);

                        Performance.insertPerformance("Loading");

                        CloudQueueMessage newXMLMessage = new CloudQueueMessage(location);
                        StorageManager.getXMLQueue().AddMessage(newXMLMessage);
                    }
                }
            }
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Loads the page at <paramref name="url"/>, indexes its title and publication date
        /// through <c>addToTable</c>, then queues every matching link found in the page's
        /// <c>&lt;head&gt;</c> <c>link</c> tags and <c>a[href]</c> anchors, publishing the
        /// <c>SizeCounter</c> values to their counter queues as it goes.
        /// </summary>
        /// <remarks>
        /// The "Crawling" performance entry and the index-size message are emitted even when
        /// <c>crawlable</c> is false; everything else is skipped in that case. Any exception
        /// raised while loading or processing the page is stored, with the URL, in the
        /// exception table.
        /// </remarks>
        /// <param name="url">Address of the HTML page to crawl.</param>
        public void parseHTML(string url)
        {
            Performance.insertPerformance("Crawling");

            CloudQueueMessage message = new CloudQueueMessage(SizeCounter.SizeIndex.ToString());

            StorageManager.getNumIndex().AddMessage(message);

            if (!crawlable)
            {
                return;
            }

            SizeCounter.NumCrawled += 1;

            CloudQueueMessage message2 = new CloudQueueMessage(SizeCounter.NumCrawled.ToString());

            StorageManager.getNumCrawled().AddMessage(message2);

            Performance.insertPerformance("Crawling");
            try
            {
                //index information
                var web   = new HtmlWeb();
                var doc   = web.Load(url);
                // NOTE(review): a page without <head><title> still throws here and ends up
                // in the exception table - confirm that is the intended handling.
                var title = doc.DocumentNode.SelectSingleNode("//head/title").InnerHtml;

                string date = "no date found";
                var    meta = doc.DocumentNode.SelectNodes("//meta");
                // Guard against a page with no <meta> tags, like the link lists below,
                // so such a page is still indexed (with the default date).
                if (meta != null)
                {
                    foreach (HtmlNode tag in meta)
                    {
                        string property = tag.GetAttributeValue("property", "");
                        if (property.Contains("published_time") || property.Contains("pubdate"))
                        {
                            date = tag.GetAttributeValue("content", "");
                        }
                    }
                }

                //add to table
                addToTable(url, title, date);

                // Header links first, then body anchors; both sets get the same filtering.
                foreach (string xpath in new[] { "//head/link", "//a[@href]" })
                {
                    var nodes = doc.DocumentNode.SelectNodes(xpath);
                    if (nodes == null)
                    {
                        continue;
                    }

                    foreach (HtmlNode node in nodes)
                    {
                        string href = node.GetAttributeValue("href", "");
                        bool   onCrawledSite = href.Contains("cnn.com")
                                               || (href.Contains("bleacherreport.com") && href.Contains("nba"));

                        // "htm" also matches every href that contains "html".
                        if (!onCrawledSite || !href.Contains("htm"))
                        {
                            continue;
                        }

                        Debug.WriteLine("new link: ");
                        Debug.WriteLine(href);

                        if (checkDisallow(href, this.Disallow) || this.Visited.Contains(href))
                        {
                            continue;
                        }

                        SizeCounter.SizeQueue++;
                        addToUrlQueue(href);

                        // Publish the updated queue size on the counter queue.
                        CloudQueueMessage queueSizeMessage = new CloudQueueMessage(SizeCounter.SizeQueue.ToString());
                        StorageManager.getNumQueue().AddMessage(queueSizeMessage);

                        Performance.insertPerformance("Crawling");
                    }
                }
            }
            catch (Exception e)
            {
                //put in exception table with URL
                ExceptionUrl except = new ClassLibrary1.ExceptionUrl(e.ToString(), url);

                //add exception to table
                TableOperation insertOrReplaceOperation = TableOperation.InsertOrReplace(except);
                StorageManager.getExceptionTable().Execute(insertOrReplaceOperation);
            }
        }