Exemplo n.º 1
0
        /// <summary>
        /// Reads the XML document at <paramref name="url"/> and processes the text of
        /// every <c>loc</c> element: links that do not contain "xml" are handed to
        /// <c>addToUrlQueue</c> (unless disallowed or already visited), while links that
        /// contain "xml" and match the date/name filter are recorded in <c>XMLurls</c>
        /// and pushed back onto the XML queue.
        /// </summary>
        /// <param name="url">Location of the XML document to read.</param>
        public void readXMLUrl(string url)
        {
            // NOTE(review): DTD processing is enabled for XML fetched from a remote URL.
            // If these documents are untrusted, consider DtdProcessing.Ignore/Prohibit.
            settings.DtdProcessing = DtdProcessing.Parse;

            // Dispose the reader (and the stream it opened) even if processing throws.
            using (XmlReader reader = XmlReader.Create(url, settings))
            {
                reader.MoveToContent();
                while (reader.Read())
                {
                    // Re-record a "Loading" performance row carrying forward the counters
                    // of the first row returned by the performance table (zeros if none).
                    // NOTE(review): this runs a table query and an insert for every XML
                    // node read - confirm that this frequency is intended.
                    var crawled   = 0;
                    var sizeQueue = 0;
                    var sizeIndex = 0;
                    TableQuery <Performance> query3 = new TableQuery <Performance>()
                                                      .Take(1);

                    foreach (Performance item in StorageManager.getPerformanceTable().ExecuteQuery(query3))
                    {
                        crawled   = item.NumCrawled;
                        sizeQueue = item.SizeQueue;
                        sizeIndex = item.SizeIndex;
                    }

                    Performance.insertPerformance("Loading", crawled, sizeQueue, sizeIndex);

                    if (reader.NodeType != XmlNodeType.Element || reader.Name != "loc")
                    {
                        continue;
                    }

                    // Advance from the <loc> start tag to its content node.
                    reader.Read();
                    if (reader.NodeType != XmlNodeType.Text)
                    {
                        continue;
                    }

                    string link = reader.Value;

                    if (!link.Contains("xml"))
                    {
                        // Non-XML link: queue it unless it is disallowed or already visited.
                        if (!checkDisallow(link, this.Disallow) && !this.Visited.Contains(link))
                        {
                            addToUrlQueue(link);
                        }
                    }
                    else if ((link.Contains("xml") && link.Contains("2018")) || link.Contains("nba.xml"))
                    {
                        // Nested XML link that passes the filter: put it back into the XML queue.
                        // (Previously this branch was nested under the "does not contain xml"
                        // check and could never execute.)
                        this.XMLurls.Add(link);

                        CloudQueueMessage newXMLMessage = new CloudQueueMessage(link);
                        StorageManager.getXMLQueue().AddMessage(newXMLMessage);
                    }
                }
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Reads the XML document at <paramref name="url"/> and processes the text of
        /// every <c>loc</c> element: links containing "htm" are handed to
        /// <c>addToUrlQueue</c> (unless disallowed or already visited), and links containing
        /// "xml" together with one of the accepted date markers are recorded in
        /// <c>XMLurls</c> and pushed back onto the XML queue.
        /// </summary>
        /// <param name="url">Location of the XML document to read.</param>
        public void readXMLUrl(string url)
        {
            // NOTE(review): DTD processing is enabled for XML fetched from a remote URL.
            // If these documents are untrusted, consider DtdProcessing.Ignore/Prohibit.
            settings.DtdProcessing = DtdProcessing.Parse;

            // Dispose the reader (and the stream it opened) even if processing throws.
            using (XmlReader reader = XmlReader.Create(url, settings))
            {
                reader.MoveToContent();
                while (reader.Read())
                {
                    // A "Loading" performance entry is recorded for every node read.
                    Performance.insertPerformance("Loading");

                    if (reader.NodeType != XmlNodeType.Element || reader.Name != "loc")
                    {
                        continue;
                    }

                    // Advance from the <loc> start tag to its content node.
                    reader.Read();
                    if (reader.NodeType != XmlNodeType.Text)
                    {
                        continue;
                    }

                    string link = reader.Value;

                    // "htm" also matches "html", so a single substring test covers both.
                    if (link.Contains("htm"))
                    {
                        // Queue the page unless it is disallowed or already visited.
                        if (!checkDisallow(link, this.Disallow) && !this.Visited.Contains(link))
                        {
                            SizeCounter.SizeQueue++;

                            addToUrlQueue(link);

                            // Publish the updated queue size on the counter queue.
                            CloudQueueMessage message = new CloudQueueMessage(SizeCounter.SizeQueue.ToString());
                            StorageManager.getNumQueue().AddMessage(message);

                            Performance.insertPerformance("Crawling");
                        }
                    }
                    else if (link.Contains("xml") && (link.Contains("2018") || link.Contains("2017/12") || link.Contains("2017-12")))
                    {
                        // Nested XML link that passes the date filter: put it back into the XML queue.
                        this.XMLurls.Add(link);

                        Performance.insertPerformance("Loading");

                        CloudQueueMessage newXMLMessage = new CloudQueueMessage(link);
                        StorageManager.getXMLQueue().AddMessage(newXMLMessage);
                    }
                }
            }
        }