Пример #1
0
        /// <summary>
        /// Downloads the robots.txt file and returns the url given by the first Sitemap directive found in it.
        /// </summary>
        /// <param name="robotsUrl">Url of the Site's robots.txt file.</param>
        /// <returns>Url of the Sitemap Index file, with surrounding whitespace removed.</returns>
        /// <exception cref="InvalidDataException">The downloaded file contains no Sitemap directive.</exception>
        private string getSitemapIndexURL(string robotsUrl)
        {
            const string sitemapDirective = "Sitemap:";

            string robotsFile;
            using (var web = new CompressedWebClient())
            {
                robotsFile = web.DownloadString(robotsUrl);
            }

            using (StringReader sr = new StringReader(robotsFile))
            {
                string lineIn;
                while ((lineIn = sr.ReadLine()) != null)
                {
                    // robots.txt field names are matched case-insensitively and the space after
                    // the colon is optional, so compare ordinally ignoring case and trim the value
                    // instead of requiring the exact text "Sitemap: ".
                    string candidate = lineIn.TrimStart();
                    if (candidate.StartsWith(sitemapDirective, StringComparison.OrdinalIgnoreCase))
                    {
                        return candidate.Substring(sitemapDirective.Length).Trim();
                    }
                }
            }

            // Reached only when no line carried a Sitemap directive.
            throw new InvalidDataException("Sitelist is malformed!");
        }
Пример #2
0
        /// <summary>
        /// Reads the sitemap urls returned by fileReader for sitemapUrlListPath, downloads each one,
        /// runs the downloaded text through sortReviewUrls, and passes the combined results to
        /// fileChunkWriter together with reviewUrlListPath.
        /// </summary>
        private void generateReviewList()
        {
            string[] sitemapUrls = fileReader(sitemapUrlListPath);
            var collectedReviewUrls = new List<string>();

            foreach (string sitemapUrl in sitemapUrls)
            {
                // A fresh client is created (and disposed) for every sitemap download.
                using (var client = new CompressedWebClient())
                {
                    string sitemapContent = client.DownloadString(sitemapUrl);
                    string[] extractedUrls = sortReviewUrls(sitemapContent);
                    collectedReviewUrls.AddRange(extractedUrls);
                }
            }

            fileChunkWriter(reviewUrlListPath, collectedReviewUrls);
        }
Пример #3
0
        /// <summary>
        /// Downloads the Sitemap Index file, collects the content of every <c>loc</c> element whose url
        /// starts with the user-review sitemap prefix, and passes the collected urls to
        /// fileChunkWriter together with sitemapUrlListPath.
        /// </summary>
        /// <param name="sitemapIndexUrl">Url of the Sitemap Index file.</param>
        private void generateSitemapList(string sitemapIndexUrl)
        {
            // Only sitemap files whose url begins with this prefix are kept.
            const string reviewSitemapPrefix = "http://www.tripadvisor.com.au/sitemap/en_AU/sitemap_sur_en_AU";

            string sitemapBaseFile;
            using (var web = new CompressedWebClient())
            {
                sitemapBaseFile = web.DownloadString(sitemapIndexUrl);
            }

            List<string> sitemapIndex = new List<string>();
            using (StringReader source = new StringReader(sitemapBaseFile))
            using (XmlReader xr = XmlReader.Create(source))
            {
                while (xr.ReadToFollowing("loc"))
                {
                    // Trim so that pretty-printed files (whitespace or line breaks around the url
                    // inside <loc>) still match the prefix and yield a clean url.
                    string currentLine = xr.ReadElementContentAsString().Trim();

                    // Urls are machine identifiers: compare ordinally, not with culture rules.
                    if (currentLine.StartsWith(reviewSitemapPrefix, StringComparison.Ordinal))
                    {
                        sitemapIndex.Add(currentLine);
                    }
                }
            }

            fileChunkWriter(sitemapUrlListPath, sitemapIndex);
        }