/// <summary>
/// Walks a <c>sitemapindex</c> element, downloads every sitemap it references
/// and merges each downloaded document into the accumulated result.
/// </summary>
/// <param name="node">The <c>sitemapindex</c> element whose children are scanned.</param>
/// <param name="inst">Per-request state: depth counter, limits and the merged document.</param>
/// <exception cref="Exception">
/// Thrown when the current nesting depth has already reached <c>inst.MaxSitemapDepth</c>.
/// Exceptions from downloading or merging a referenced sitemap propagate unchanged.
/// </exception>
private void readFromSitemapindex(System.Xml.XmlNode node, SitemapUrlLoaderInstanceData inst)
{
    if (inst.CurrDepth >= inst.MaxSitemapDepth)
    {
        throw new Exception("Exceeded maximum sitemap depth.");
    }

    // Depth is raised for the duration of this index and lowered again once
    // all of its entries have been processed without an exception.
    inst.CurrDepth++;

    foreach (System.Xml.XmlNode entry in node.ChildNodes)
    {
        if (entry.Name != "sitemap")
        {
            continue;
        }

        foreach (System.Xml.XmlNode field in entry.ChildNodes)
        {
            if (field.Name != "loc")
            {
                continue;
            }

            // Each <loc> holds the address of another sitemap (or sitemap index).
            Uri target = new Uri(field.InnerText);
            System.Xml.XmlDocument fetched = getXMLDocumentFromUrl(target, inst);
            mergeFromSitemapindex(fetched, inst);
        }
    }

    inst.CurrDepth--;
}
/// <summary>
/// Folds a downloaded sitemap document into <c>inst.Doc</c>.
/// A <c>urlset</c> document either becomes the accumulated document (when none
/// exists yet) or has the children of its root copied into the accumulated
/// document's root. A <c>sitemapindex</c> document is expanded recursively.
/// A document without a root element is ignored.
/// </summary>
/// <param name="doc">The document that was just downloaded.</param>
/// <param name="inst">Per-request state holding the accumulated document.</param>
/// <exception cref="Exception">
/// Thrown when the root element is neither <c>urlset</c> nor <c>sitemapindex</c>.
/// </exception>
private void mergeFromSitemapindex(System.Xml.XmlDocument doc, SitemapUrlLoaderInstanceData inst)
{
    System.Xml.XmlElement root = doc.DocumentElement;
    if (root == null)
    {
        return;
    }

    switch (root.Name)
    {
        case "urlset":
            if (inst.Doc == null)
            {
                // First urlset seen: adopt it as the accumulator as-is.
                inst.Doc = doc;
                break;
            }

            foreach (System.Xml.XmlNode entry in root.ChildNodes)
            {
                // Nodes belong to their owning document, so a deep copy must be
                // imported before it can be appended to the accumulator.
                System.Xml.XmlNode imported = inst.Doc.ImportNode(entry, true);
                inst.Doc.DocumentElement.AppendChild(imported);
            }
            break;

        case "sitemapindex":
            readFromSitemapindex(root, inst);
            break;

        default:
            throw new Exception("Invalid sitemap.");
    }
}
/// <summary>
/// Reads a URL from the supplied stream (decoded as UTF-8), downloads the
/// sitemap it points to, expands any sitemap indexes, and returns the merged
/// sitemap as an XML string.
/// </summary>
/// <param name="url">Stream whose entire content is the URL text.</param>
/// <returns>
/// The inner XML of the merged sitemap document. If anything fails after the
/// stream has been read, the outgoing response is set to HTTP 500 with content
/// type <c>text/plain</c> and an error message is returned instead.
/// </returns>
public string GetSitemapFromUrl(System.IO.Stream url)
{
    // Reading the input happens outside the try block, so a failure here
    // propagates to the caller rather than being turned into an error body.
    System.IO.MemoryStream captured = new System.IO.MemoryStream();
    url.CopyTo(captured);

    try
    {
        byte[] raw = captured.ToArray();
        string address = UTF8Encoding.UTF8.GetString(raw, 0, raw.Length);

        SitemapUrlLoaderInstanceData state = new SitemapUrlLoaderInstanceData
        {
            AmountDownloaded = 0,
            CurrDepth = 0,
            NumRequests = 0,
            Doc = null
        };

        System.Xml.XmlDocument first = getXMLDocumentFromUrl(new Uri(address), state);
        mergeFromSitemapindex(first, state);

        if (state.Doc == null)
        {
            throw new Exception("No sitemap in url.");
        }

        return state.Doc.InnerXml;
    }
    catch (Exception e)
    {
        var response = WebOperationContext.Current.OutgoingResponse;
        response.StatusCode = System.Net.HttpStatusCode.InternalServerError;
        response.ContentType = "text/plain";
        return "Error when retrieving sitemap: " + e.Message;
    }
}
//Slightly modified from Sitemap filter
/// <summary>
/// Downloads the resource at <paramref name="url"/> and parses it as XML.
/// Payloads that start with the gzip magic bytes (0x1f 0x8b) are handed to
/// <c>getDocFromGzipStream</c>; everything else is parsed directly.
/// </summary>
/// <param name="url">Address to fetch; file and loopback URIs are rejected.</param>
/// <param name="inst">
/// Per-request state. <c>NumRequests</c> is incremented and checked against
/// <c>MaxRequests</c>; <c>AmountDownloaded</c> is increased by the number of
/// bytes actually received and checked against <c>MaxDownload</c>.
/// </param>
/// <returns>The parsed document; never null.</returns>
/// <exception cref="Exception">
/// Thrown for file/loopback URIs, when the request or download quota is
/// exceeded, or when the gzip helper yields no document. Network and XML
/// parsing exceptions propagate unchanged.
/// </exception>
private System.Xml.XmlDocument getXMLDocumentFromUrl(Uri url, SitemapUrlLoaderInstanceData inst)
{
    if (url.IsFile || url.IsLoopback)
    {
        throw new Exception("Can't reference a file or a local path in URL.");
    }

    inst.NumRequests++;
    if (inst.NumRequests > inst.MaxRequests)
    {
        throw new Exception("Exceeded maximum number of web requests");
    }

    System.Net.WebRequest req = System.Net.WebRequest.Create(url);
    req.Timeout = 10000;

    byte[] buff;
    // using-blocks guarantee the response and streams are released even when
    // a quota check or a read throws.
    using (System.Net.WebResponse res = req.GetResponse())
    using (System.IO.Stream rs = res.GetResponseStream())
    using (System.IO.MemoryStream memstr = new System.IO.MemoryStream())
    {
        // Fail fast when the server announces an over-quota length. ContentLength
        // is -1 when no length is reported, so it cannot be the only check.
        long declared = res.ContentLength;
        if ((declared > 0) && ((declared + inst.AmountDownloaded) > inst.MaxDownload))
        {
            throw new Exception("Exceeded maximum sitemap length.");
        }

        // Enforce the quota on the bytes actually received, so responses without
        // a (truthful) Content-Length header cannot bypass the limit.
        byte[] chunk = new byte[8192];
        int read;
        while ((read = rs.Read(chunk, 0, chunk.Length)) > 0)
        {
            if ((inst.AmountDownloaded + memstr.Length + read) > inst.MaxDownload)
            {
                throw new Exception("Exceeded maximum sitemap length.");
            }
            memstr.Write(chunk, 0, read);
        }

        inst.AmountDownloaded += memstr.Length;

        // ToArray returns exactly the bytes written; GetBuffer would expose the
        // unused tail of the internal buffer as well.
        buff = memstr.ToArray();
    }

    System.Xml.XmlDocument sitemap;
    if ((buff.Length > 2) && (buff[0] == 0x1f) && (buff[1] == 0x8b)) //Gzipped
    {
        // getDocFromGzipStream is defined elsewhere in this class; presumably it
        // decompresses the stream and parses the result.
        using (System.IO.MemoryStream gz = new System.IO.MemoryStream(buff))
        {
            sitemap = getDocFromGzipStream(gz);
        }

        if (sitemap == null)
        {
            // The original constructed this exception but never threw it, so a
            // null document leaked out to the caller.
            throw new Exception("Internal server error.");
        }
    }
    else
    {
        // Kept from the original: content after the first NUL byte is ignored.
        int end = Array.IndexOf(buff, (byte)0);
        if (end < 0)
        {
            end = buff.Length;
        }

        sitemap = new System.Xml.XmlDocument();
        // The payload is untrusted remote content: do not resolve external
        // DTDs/entities while parsing it.
        sitemap.XmlResolver = null;

        // Loading from bytes lets the XML parser handle a byte-order mark and the
        // declared encoding itself, which LoadXml on a pre-decoded string cannot.
        using (System.IO.MemoryStream xml = new System.IO.MemoryStream(buff, 0, end, false))
        {
            sitemap.Load(xml);
        }
    }

    return sitemap;
}