/// <summary>
/// Downloads robots.txt and returns the url given by its first non-empty
/// <c>Sitemap:</c> directive.
/// </summary>
/// <remarks>
/// The directive name is matched case-insensitively and the whitespace around
/// its value is optional, so <c>sitemap:http://...</c> and
/// <c>Sitemap:   http://...</c> are both accepted.
/// </remarks>
/// <param name="robotsUrl">Url of the Site's robots.txt file.</param>
/// <returns>Url of the Sitemap Index file, with surrounding whitespace removed.</returns>
/// <exception cref="InvalidDataException">
/// The downloaded robots.txt contains no usable Sitemap directive.
/// </exception>
private string getSitemapIndexURL(string robotsUrl)
{
    const string SitemapDirective = "Sitemap:";

    string robotsFile;
    using (var web = new CompressedWebClient())
    {
        robotsFile = web.DownloadString(robotsUrl);
    }

    using (var reader = new StringReader(robotsFile))
    {
        string lineIn;
        while ((lineIn = reader.ReadLine()) != null)
        {
            // Tolerate indentation in front of the directive.
            string candidate = lineIn.TrimStart();

            // Ordinal comparison: this is a protocol keyword, not linguistic text.
            if (!candidate.StartsWith(SitemapDirective, StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            string sitemapUrl = candidate.Substring(SitemapDirective.Length).Trim();

            // A directive with no value is useless; keep looking for a real one.
            if (sitemapUrl.Length > 0)
            {
                return sitemapUrl;
            }
        }
    }

    // InvalidDataException derives from Exception, so callers that catch
    // Exception keep working while new callers can catch the specific type.
    throw new InvalidDataException("Sitelist is malformed!");
}
/// <summary>
/// Downloads every url listed in the file at <c>sitemapUrlListPath</c>, passes each
/// downloaded document to <c>sortReviewUrls</c>, and writes the combined results
/// to <c>reviewUrlListPath</c> via <c>fileChunkWriter</c>.
/// </summary>
private void generateReviewList()
{
    var collectedUrls = new List<string>();

    foreach (string sitemapUrl in fileReader(sitemapUrlListPath))
    {
        // A fresh client is created (and disposed) for each download.
        using (var client = new CompressedWebClient())
        {
            string sitemapContent = client.DownloadString(sitemapUrl);
            collectedUrls.AddRange(sortReviewUrls(sitemapContent));
        }
    }

    fileChunkWriter(reviewUrlListPath, collectedUrls);
}
/// <summary>
/// Downloads the Sitemap Index file, collects the content of every <c>loc</c>
/// element that starts with the user-review sitemap prefix, and writes the
/// resulting list to <c>sitemapUrlListPath</c> via <c>fileChunkWriter</c>.
/// </summary>
/// <param name="sitemapIndexUrl">Url of the Sitemap Index file.</param>
private void generateSitemapList(string sitemapIndexUrl)
{
    const string ReviewSitemapPrefix = "http://www.tripadvisor.com.au/sitemap/en_AU/sitemap_sur_en_AU";

    string sitemapBaseFile;
    using (var web = new CompressedWebClient())
    {
        sitemapBaseFile = web.DownloadString(sitemapIndexUrl);
    }

    List<string> sitemapIndex = new List<string>();

    using (StringReader sr = new StringReader(sitemapBaseFile))
    using (XmlReader xr = XmlReader.Create(sr))
    {
        bool hasNode = xr.Read();
        while (hasNode)
        {
            if (xr.NodeType == XmlNodeType.Element && xr.Name == "loc")
            {
                // ReadElementContentAsString leaves the reader on the node that
                // follows </loc>. Do not Read() again here: that would step over
                // a <loc> element placed directly after this one.
                string currentLine = xr.ReadElementContentAsString();

                // Ordinal comparison: urls are identifiers, not linguistic text.
                if (currentLine.StartsWith(ReviewSitemapPrefix, StringComparison.Ordinal))
                {
                    sitemapIndex.Add(currentLine);
                }

                hasNode = !xr.EOF;
            }
            else
            {
                hasNode = xr.Read();
            }
        }
    }

    fileChunkWriter(sitemapUrlListPath, sitemapIndex);
}