/// <summary>
/// Reads the XML document at <paramref name="url"/> and routes the text of every "loc" element:
/// values that do not contain "xml" are treated as page links and handed to addToUrlQueue,
/// while values that contain "xml" together with "2018" or "nba.xml" are pushed back onto the XML queue.
/// </summary>
/// <param name="url">Location of the XML document; passed straight to XmlReader.Create.</param>
public void readXMLUrl(string url)
{
    settings.DtdProcessing = DtdProcessing.Parse;

    // The reader owns the stream it opens for url, so dispose it even when parsing throws.
    using (XmlReader reader = XmlReader.Create(url, settings))
    {
        reader.MoveToContent();
        while (reader.Read())
        {
            // Add a "Loading" performance row without changing queue size, index size, or number crawled:
            // the counters are copied from the first row the performance table returns (zeros when it is empty).
            var crawled = 0;
            var sizeQueue = 0;
            var sizeIndex = 0;
            TableQuery<Performance> latestQuery = new TableQuery<Performance>().Take(1);
            foreach (Performance item in StorageManager.getPerformanceTable().ExecuteQuery(latestQuery))
            {
                crawled = item.NumCrawled;
                sizeQueue = item.SizeQueue;
                sizeIndex = item.SizeIndex;
            }
            Performance.insertPerformance("Loading", crawled, sizeQueue, sizeIndex);

            if (reader.NodeType != XmlNodeType.Element || reader.Name != "loc")
            {
                continue;
            }

            // Step from the <loc> start tag onto its content node.
            reader.Read();
            if (reader.NodeType != XmlNodeType.Text)
            {
                continue;
            }

            string link = reader.Value;
            if (!link.Contains("xml"))
            {
                // Page link: queue it unless checkDisallow flags it or it is already in Visited.
                if (!checkDisallow(link, this.Disallow) && !this.Visited.Contains(link))
                {
                    addToUrlQueue(link);
                }
            }
            else if (link.Contains("2018") || link.Contains("nba.xml"))
            {
                // Nested XML link: put it back into the XML queue.
                // Previously this branch was nested under the "does not contain xml" check
                // and could never run, so nested XML links were silently dropped.
                this.XMLurls.Add(link);
                CloudQueueMessage newXMLMessage = new CloudQueueMessage(link);
                StorageManager.getXMLQueue().AddMessage(newXMLMessage);
            }
        }
    }
}
/// <summary>
/// Reads the XML document at <paramref name="url"/> and routes the text of every "loc" element:
/// values containing "htm" are treated as page links and handed to addToUrlQueue,
/// while values containing "xml" together with "2018", "2017/12" or "2017-12" are pushed back onto the XML queue.
/// </summary>
/// <param name="url">Location of the XML document; passed straight to XmlReader.Create.</param>
public void readXMLUrl(string url)
{
    settings.DtdProcessing = DtdProcessing.Parse;

    // The reader owns the stream it opens for url, so dispose it even when parsing throws.
    using (XmlReader reader = XmlReader.Create(url, settings))
    {
        reader.MoveToContent();
        while (reader.Read())
        {
            // One "Loading" row is recorded for every node the reader advances over.
            Performance.insertPerformance("Loading");

            if (reader.NodeType != XmlNodeType.Element || reader.Name != "loc")
            {
                continue;
            }

            // Step from the <loc> start tag onto its content node.
            reader.Read();
            if (reader.NodeType != XmlNodeType.Text)
            {
                continue;
            }

            string link = reader.Value;

            // "htm" is a substring of "html", so this single test covers both spellings.
            if (link.Contains("htm"))
            {
                // Page link: queue it unless checkDisallow flags it or it is already in Visited.
                if (!checkDisallow(link, this.Disallow) && !this.Visited.Contains(link))
                {
                    SizeCounter.SizeQueue++;
                    addToUrlQueue(link);

                    // Publish the updated queue size, then record the state change.
                    CloudQueueMessage message = new CloudQueueMessage(SizeCounter.SizeQueue.ToString());
                    StorageManager.getNumQueue().AddMessage(message);
                    Performance.insertPerformance("Crawling");
                }
            }
            else if (link.Contains("xml")
                && (link.Contains("2018") || link.Contains("2017/12") || link.Contains("2017-12")))
            {
                // Nested XML link: put it back into the XML queue.
                this.XMLurls.Add(link);
                Performance.insertPerformance("Loading");
                CloudQueueMessage newXMLMessage = new CloudQueueMessage(link);
                StorageManager.getXMLQueue().AddMessage(newXMLMessage);
            }
        }
    }
}