/// <summary>
/// Called when the spider is ready to process an HTML URL.
/// </summary>
/// <param name="url">The URL that the spider is about to process.</param>
/// <param name="parse">An object that will allow you to parse the HTML on this page.</param>
public void SpiderProcessURL(Uri url, SpiderParseHTML parse)
{
    try
    {
        // Consume the whole page so every link on it is discovered.
        parse.ReadAll();
    }
    catch (IOException)
    {
        // A read failure on a single page must not stop the crawl;
        // record it and move on.
        spider.Logging.Log(Logger.Level.INFO, "Error reading page:" + url.ToString());
    }
}
/// <summary>
/// This method is called by the thread pool to process one single URL:
/// fetch it, hand it to the report (as parsed HTML or as a raw stream),
/// and update the workload manager with the outcome.
/// </summary>
/// <param name="stateInfo">The URL to process, boxed as an Object by the thread pool.</param>
private void SpiderWorkerProc(Object stateInfo)
{
    Stream istream = null;
    HttpWebResponse response = null;
    Uri url = null;
    Uri responseUri = null;
    try
    {
        url = (Uri)stateInfo;
        logging.Log(Logger.Level.INFO, "Processing: " + url);

        // Get the URL's contents.
        WebRequest http = WebRequest.Create(url);
        http.Timeout = this.options.Timeout;
        if (this.options.UserAgent != null)
        {
            // User-Agent is a restricted header on HttpWebRequest;
            // assigning it through Headers throws an ArgumentException.
            // It must be set via the dedicated property.
            ((HttpWebRequest)http).UserAgent = this.options.UserAgent;
        }
        response = (HttpWebResponse)http.GetResponse();

        // Capture the final URI now, so the redirect bookkeeping below
        // does not have to touch the response after it is closed.
        responseUri = response.ResponseUri;

        // Read the URL.
        istream = response.GetResponseStream();

        // Parse the URL. The content type frequently carries a charset
        // suffix (e.g. "text/html; charset=utf-8"), so an exact equality
        // check would mis-route HTML pages; test the prefix instead.
        if (response.ContentType != null
            && response.ContentType.StartsWith("text/html", StringComparison.OrdinalIgnoreCase))
        {
            SpiderParseHTML parse = new SpiderParseHTML(responseUri,
                new SpiderInputStream(istream, null), this);
            this.report.SpiderProcessURL(url, parse);
        }
        else
        {
            this.report.SpiderProcessURL(url, istream);
        }
    }
    catch (IOException e)
    {
        logging.Log(Logger.Level.INFO, "I/O error on URL:" + url);
        try
        {
            this.workloadManager.MarkError(url);
        }
        catch (WorkloadException)
        {
            logging.Log(Logger.Level.ERROR, "Error marking workload(1).", e);
        }
        this.report.SpiderURLError(url);
        return;
    }
    catch (WebException e)
    {
        logging.Log(Logger.Level.INFO, "Web error on URL:" + url);
        try
        {
            this.workloadManager.MarkError(url);
        }
        catch (WorkloadException)
        {
            logging.Log(Logger.Level.ERROR, "Error marking workload(2).", e);
        }
        this.report.SpiderURLError(url);
        return;
    }
    catch (Exception e)
    {
        try
        {
            this.workloadManager.MarkError(url);
        }
        catch (WorkloadException)
        {
            logging.Log(Logger.Level.ERROR, "Error marking workload(3).", e);
        }
        // Concatenate url directly: if the (Uri)stateInfo cast failed,
        // url is still null and ToString() would throw inside the handler.
        logging.Log(Logger.Level.ERROR, "Caught exception at URL:" + url, e);
        this.report.SpiderURLError(url);
        return;
    }
    finally
    {
        if (istream != null)
        {
            istream.Close();
        }
        // Closing the stream alone does not release the HttpWebResponse;
        // close it explicitly so the connection is returned to the pool.
        if (response != null)
        {
            response.Close();
        }
    }

    try
    {
        // Mark URL as complete.
        this.workloadManager.MarkProcessed(url);
        logging.Log(Logger.Level.INFO, "Complete: " + url);
        if (!url.Equals(responseUri))
        {
            // Save the final URL as well, so redirect targets
            // are not crawled a second time.
            this.workloadManager.Add(responseUri, url,
                this.workloadManager.GetDepth(responseUri));
            this.workloadManager.MarkProcessed(responseUri);
        }
    }
    catch (WorkloadException e)
    {
        logging.Log(Logger.Level.ERROR, "Error marking workload(4).", e);
    }
}
/// <summary>
/// Called when the spider is ready to process an HTML URL.
/// Download the contents of the URL to a local file.
/// </summary>
/// <param name="url">The URL that the spider is about to process.</param>
/// <param name="parse">An object that will allow you to parse the HTML on this page.</param>
public void SpiderProcessURL(Uri url, SpiderParseHTML parse)
{
    String filename = URLUtility.convertFilename(this.path, url, true);
    // using guarantees the file handle is released even when
    // ReadAll throws; the original leaked the stream on a parse error.
    using (Stream os = new FileStream(filename, FileMode.Create))
    {
        parse.Stream.OutputStream = os;
        parse.ReadAll();
    }
}