Exemple #1
0
        /// <summary>
        /// Indexes a page under every keyword of its title. The title is split on spaces, each
        /// token is trimmed of surrounding punctuation and lower-cased, and one <c>WebPage</c> row
        /// is inserted per non-empty keyword. After each inserted row a "Crawling" performance
        /// entry is written with the index counter increased by one.
        /// </summary>
        /// <param name="url">Address of the page being indexed.</param>
        /// <param name="title">Page title the keywords are taken from.</param>
        /// <param name="date">Date string stored with every row.</param>
        public void addToTable(string url, string title, string date)
        {
            char[] trimChars = { ' ', '-', '\'', '\"', ',' };

            foreach (string keyword in title.Split(' '))
            {
                string keywordTrim = keyword.Trim(trimChars).ToLower();

                // Consecutive spaces and punctuation-only tokens trim down to nothing. Skip them
                // before building an entity so the index counter only counts rows actually stored.
                if (string.IsNullOrEmpty(keywordTrim))
                {
                    continue;
                }

                //add page to table
                WebPage page = new WebPage(url, title, keywordTrim, date);
                TableOperation insertOperation = TableOperation.Insert(page);
                StorageManager.getTable().Execute(insertOperation);

                //add performance and increase index counter
                // Counters stay at zero when the performance table returns no entry.
                var crawled   = 0;
                var sizeQueue = 0;
                var sizeIndex = 0;
                TableQuery <Performance> firstEntryQuery = new TableQuery <Performance>()
                                                           .Take(1);

                foreach (Performance item in StorageManager.getPerformanceTable().ExecuteQuery(firstEntryQuery))
                {
                    crawled   = item.NumCrawled;
                    sizeQueue = item.SizeQueue;
                    sizeIndex = item.SizeIndex + 1;
                }

                Performance.insertPerformance("Crawling", crawled, sizeQueue, sizeIndex);
            }
        }
Exemple #2
0
        /// <summary>
        /// Samples the total processor time and the available memory, then stores them together
        /// with the supplied crawler counters as a new row in the performance table.
        /// </summary>
        /// <param name="status">Status label stored with the entry (callers pass e.g. "Crawling" or "Loading").</param>
        /// <param name="crawlNum">Number-crawled counter to store.</param>
        /// <param name="queueSize">Queue-size counter to store.</param>
        /// <param name="indexSize">Index-size counter to store.</param>
        public static void insertPerformance(string status, int crawlNum, int queueSize, int indexSize)
        {
            float cpu;
            float ram;

            // PerformanceCounter is IDisposable; this method is called very frequently, so the
            // counters are released as soon as the samples have been taken.
            using (PerformanceCounter theCPUCounter = new PerformanceCounter("Processor", "% Processor Time", "_Total"))
            {
                // The first reading only primes the counter; the value sampled after the short
                // pause is the one that gets stored.
                theCPUCounter.NextValue();
                Thread.Sleep(100);
                cpu = theCPUCounter.NextValue();
            }

            using (PerformanceCounter theMemCounter = new PerformanceCounter("Memory", "Available MBytes"))
            {
                ram = theMemCounter.NextValue();
            }

            //Create performance entries
            Performance performance = new ClassLibrary1.Performance(status, cpu, ram, crawlNum, queueSize, indexSize);

            //add performance entry to table
            TableOperation insertOperation = TableOperation.Insert(performance);

            StorageManager.getPerformanceTable().Execute(insertOperation);
        }
Exemple #3
0
        /// <summary>
        /// Records <paramref name="url"/> as visited, pushes it onto the URL queue and writes a
        /// "Crawling" performance entry whose queue size is one larger than the entry read back
        /// from the performance table.
        /// </summary>
        /// <param name="url">Address to enqueue for crawling.</param>
        public void addToUrlQueue(string url)
        {
            this.Visited.Add(url);

            CloudQueueMessage message = new CloudQueueMessage(url);
            StorageManager.getUrlQueue().AddMessage(message);

            // Counters remain zero when the performance table returns no entry.
            int numCrawled = 0;
            int queueSize  = 0;
            int indexSize  = 0;

            // Only the first entry returned by the table is consulted.
            TableQuery <Performance> firstEntryQuery = new TableQuery <Performance>().Take(1);
            var entries = StorageManager.getPerformanceTable().ExecuteQuery(firstEntryQuery);

            foreach (Performance entry in entries)
            {
                numCrawled = entry.NumCrawled;
                queueSize  = entry.SizeQueue + 1;   // one more URL is now waiting in the queue
                indexSize  = entry.SizeIndex;
            }

            Performance.insertPerformance("Crawling", numCrawled, queueSize, indexSize);
        }
Exemple #4
0
        /// <summary>
        /// Reads the XML document at <paramref name="url"/> and dispatches the text of every
        /// <c>loc</c> element. Locations that do not contain "xml" are added to the URL queue
        /// unless they are disallowed or already visited. Locations that contain "xml" are put
        /// back on the XML queue when they also contain "2018" or "nba.xml". A "Loading"
        /// performance entry with unchanged counters is written for every node read.
        /// </summary>
        /// <param name="url">Address of the XML document to read.</param>
        public void readXMLUrl(string url)
        {
            // NOTE(review): DTD processing is enabled for documents fetched from remote sites;
            // confirm that the shared settings bound entity expansion.
            settings.DtdProcessing = DtdProcessing.Parse;

            // The reader holds the underlying stream, so release it once the document is consumed.
            using (XmlReader reader = XmlReader.Create(url, settings))
            {
                reader.MoveToContent();
                while (reader.Read())
                {
                    //add performance with no changes in queue size, index size, or number crawled
                    var crawled   = 0;
                    var sizeQueue = 0;
                    var sizeIndex = 0;
                    TableQuery <Performance> firstEntryQuery = new TableQuery <Performance>()
                                                               .Take(1);

                    foreach (Performance item in StorageManager.getPerformanceTable().ExecuteQuery(firstEntryQuery))
                    {
                        crawled   = item.NumCrawled;
                        sizeQueue = item.SizeQueue;
                        sizeIndex = item.SizeIndex;
                    }

                    Performance.insertPerformance("Loading", crawled, sizeQueue, sizeIndex);

                    if (reader.NodeType != XmlNodeType.Element || reader.Name != "loc")
                    {
                        continue;
                    }

                    // Step from the <loc> start tag onto its content; only text content is usable.
                    reader.Read();
                    if (reader.NodeType != XmlNodeType.Text)
                    {
                        continue;
                    }

                    string location = reader.Value;

                    if (!location.Contains("xml"))
                    {
                        //add html urls to the Url queue unless disallowed or already visited
                        if (!checkDisallow(location, this.Disallow) && !this.Visited.Contains(location))
                        {
                            addToUrlQueue(location);
                        }
                    }
                    else if (location.Contains("2018") || location.Contains("nba.xml"))
                    {
                        // This branch must hang off the "contains xml" test: nested under the
                        // HTML case it could never be reached and no XML link was ever re-queued.
                        //add xml urls to list to be put back into XML queue
                        this.XMLurls.Add(location);

                        CloudQueueMessage newXMLMessage = new CloudQueueMessage(location);
                        StorageManager.getXMLQueue().AddMessage(newXMLMessage);
                    }
                }
            }
        }
Exemple #5
0
        /// <summary>
        /// Crawls one HTML page: writes a "Crawling" performance entry with the crawled counter
        /// increased by one, loads the page, indexes its title (with the publication date found in
        /// its meta tags, if any) and enqueues every in-scope link from the head and the body that
        /// is neither disallowed nor already visited. Any exception raised while loading or
        /// processing the page is stored in the exception table together with the URL.
        /// Does nothing when crawling is switched off.
        /// </summary>
        /// <param name="url">Address of the page to crawl.</param>
        public void parseHTML(string url)
        {
            if (!crawlable)
            {
                return;
            }

            //update status, increase number crawled
            var crawled   = 0;
            var sizeQueue = 0;
            var sizeIndex = 0;
            TableQuery <Performance> firstEntryQuery = new TableQuery <Performance>()
                                                       .Take(1);

            foreach (Performance item in StorageManager.getPerformanceTable().ExecuteQuery(firstEntryQuery))
            {
                crawled   = item.NumCrawled + 1;
                sizeQueue = item.SizeQueue;
                sizeIndex = item.SizeIndex;
            }

            Performance.insertPerformance("Crawling", crawled, sizeQueue, sizeIndex);
            try
            {
                //index information
                var web = new HtmlWeb();
                var doc = web.Load(url);

                // A page without a <title> makes this line throw; the catch below records it.
                var title = doc.DocumentNode.SelectSingleNode("//head/title").InnerHtml;

                string date = "no date found";
                var    meta = doc.DocumentNode.SelectNodes("//meta");

                // SelectNodes yields null rather than an empty collection when nothing matches,
                // so a page without meta tags must not be enumerated.
                if (meta != null)
                {
                    foreach (HtmlNode tag in meta)
                    {
                        string property = tag.GetAttributeValue("property", "");
                        if (property.Contains("published_time") || property.Contains("pubdate"))
                        {
                            date = tag.GetAttributeValue("content", "");
                        }
                    }
                }

                //add to table
                addToTable(url, title, date);

                //check header links first, then body links, that relate to the link
                foreach (string xpath in new[] { "//head/link", "//a[@href]" })
                {
                    var nodes = doc.DocumentNode.SelectNodes(xpath);
                    if (nodes == null)
                    {
                        continue;
                    }

                    foreach (HtmlNode node in nodes)
                    {
                        string href = node.GetAttributeValue("href", "");
                        bool inScope = href.Contains("cnn.com")
                                       || href.Contains("bleacherreport.com/articles")
                                       || href.Contains("bleacherreport.com/nba");

                        if (inScope && !checkDisallow(href, this.Disallow) && !this.Visited.Contains(href))
                        {
                            addToUrlQueue(href);
                        }
                    }
                }
            }
            catch (Exception e)
            {
                //put in exception table with URL
                ExceptionUrl except = new ClassLibrary1.ExceptionUrl(e.ToString(), url);

                //add exception to table
                TableOperation insertOrReplaceOperation = TableOperation.InsertOrReplace(except);
                StorageManager.getExceptionTable().Execute(insertOrReplaceOperation);
            }
        }