Пример #1
0
 /// <summary>
 /// Handler for a child page's WebPageLoaded event: records the page in
 /// <c>childPages</c> and signals <c>waitForPages</c>.
 /// </summary>
 /// <param name="sender">Event source; not used.</param>
 /// <param name="page">The page to record.</param>
 private void OnWebPageLoaded(object sender, ChildPage page)
 {
     // The list is both the shared state and the lock object, so the add and the
     // signal happen atomically with respect to other holders of this lock.
     lock (childPages)
     {
         bool urlAlreadyRecorded = childPages.Any(recorded => recorded.url == page.url);
         if (urlAlreadyRecorded)
         {
             // A page with the same url is already in the list: only a blank line is
             // written to the console; the page is still appended below.
             Console.WriteLine();
         }

         childPages.Add(page);
         waitForPages.Set();
     }
 }
Пример #2
0
        /*protected virtual void HandleResponse(HttpWebResponse response)
         * {
         *  string htmlString = DecompressHtml(response);
         *
         *  jobStatus = JobStatus.HandlingResponse;
         *  if (htmlString != string.Empty)
         *  {
         *      if (results.Any(obj => obj.Value.GetType() == typeof(TextUpdate)))
         *      {
         *          foreach (TextUpdate textUpdate in results.Values)
         *          {
         *              textUpdate.FilterByTags(htmlString);
         *          }
         *      }
         *
         *      if (results.Any(obj => obj.Value.GetType() == typeof(LinkFeed)))
         *      {
         *          MultiValueDictionary<string, ObjectId> links = new MultiValueDictionary<string, ObjectId>();
         *          foreach (LinkFeed feed in results.Values)
         *          {
         *              HashSet<string> filteredLinks = feed.FilterByTags(htmlString);
         *              foreach (string link in filteredLinks)
         *              {
         *                  links.Add(link, feed.recordid);
         *              }
         *          }
         *
         *          jobStatus = JobStatus.LoadingPages;
         *          LoadChildPages(links);
         *
         *          foreach (ChildPage page in childPages)
         *          {
         *              foreach (ObjectId jobId in page.jobIds)
         *              {
         *                  results[jobId].AddChildPage(page);
         *              }
         *          }
         *
         *          jobStatus = JobStatus.RankingPages;
         *          foreach (LinkFeed feed in results.Values)
         *          {
         *              feed.ProcessKeywordScores();
         *          }
         *      }
         *
         *      jobStatus = JobStatus.Finished;
         *      timeStamp = DateTime.UtcNow;
         *      WebCrawler.Instance.EnqueueResult(this);
         *  }
         * }*/

        /// <summary>
        /// Creates a <see cref="ChildPage"/> for every entry in <paramref name="links"/>,
        /// queues each one on the crawler, and blocks the calling thread until
        /// <c>childPages</c> holds as many pages as there are link entries.
        /// </summary>
        /// <param name="links">
        /// Each key is passed to the ChildPage constructor together with the collection of
        /// ObjectIds stored under that key.
        /// </param>
        private void LoadChildPages(MultiValueDictionary <string, ObjectId> links)
        {
            childPages   = new List <ChildPage>();
            waitForPages = new ManualResetEvent(false);

            foreach (KeyValuePair <string, IReadOnlyCollection <ObjectId> > pair in links)
            {
                ChildPage page = new ChildPage(pair.Key, DateTime.Now, pair.Value);

                page.WebPageLoaded += new EventHandler <ChildPage>(OnWebPageLoaded);
                page.LoadError     += new EventHandler(OnLoadError);
                WebCrawler.Instance.EnqueueWork(page);
            }

            while (true)
            {
                // Check the count and re-arm the event under the same lock that
                // OnWebPageLoaded holds while it adds a page and calls Set(). Without this,
                // a Set() arriving between the count check and Reset() would be erased and
                // WaitOne() could block forever after the last page had already arrived.
                lock (childPages)
                {
                    if (childPages.Count == links.Count)
                    {
                        break;
                    }

                    waitForPages.Reset();
                }

                // Wait outside the lock so the handler can acquire it and signal.
                waitForPages.WaitOne();
            }
        }
Пример #3
0
        /// <summary>
        /// Completes the asynchronous web request carried in the result's AsyncState and
        /// hands a successful response to <c>HandleResponse</c>.
        /// </summary>
        /// <remarks>
        /// On a <see cref="WebException"/> the url and the exception are written to the
        /// console, the failure is recorded (job status for an HtmlRecord, load-error event
        /// for a ChildPage), and then, while <c>waitTime</c> is below
        /// <c>WebCrawler.TimeoutPeriod</c>, the request is re-queued with a longer wait time
        /// unless the server answered 403 Forbidden or 400 Bad Request. Once the timeout
        /// period is reached, a ChildPage raises its loaded event instead of retrying.
        /// Any other exception is logged and rethrown.
        /// </remarks>
        /// <param name="webRequest">Async result whose AsyncState is the HttpWebRequest.</param>
        private void GetResponse(IAsyncResult webRequest)
        {
            HttpWebRequest  request  = (HttpWebRequest)webRequest.AsyncState;
            HttpWebResponse response = null;

            try
            {
                response = (HttpWebResponse)request.EndGetResponse(webRequest);
            }
            catch (WebException webEx)
            {
                Console.ForegroundColor = ConsoleColor.Red;
                Console.WriteLine(url);
                Console.ForegroundColor = ConsoleColor.Gray;
                Console.WriteLine(webEx.ToString());

                // Null when this object is not a ChildPage; checked before every use so a
                // failed cast cannot raise a NullReferenceException that hides the web error.
                ChildPage childPage = this as ChildPage;

                if (this is HtmlRecord)
                {
                    jobStatus = JobStatus.ErrorRequesting;
                }
                else if (childPage != null)
                {
                    childPage.InvokeLoadErrorEvent();
                }

                if (waitTime < WebCrawler.TimeoutPeriod)
                {
                    bool retry = true;

                    // The error response holds on to the underlying connection until it is
                    // closed, so dispose it as soon as the status code has been read.
                    using (HttpWebResponse errorResponse = webEx.Response as HttpWebResponse)
                    {
                        if (errorResponse != null)
                        {
                            switch (errorResponse.StatusCode)
                            {
                            case HttpStatusCode.Forbidden:
                                serverResponse = errorResponse.StatusCode;
                                retry          = false;
                                break;

                            case HttpStatusCode.BadRequest:
                                retry = false;
                                break;
                            }
                        }
                    }

                    // No HTTP response at all, or any other status code: try again later.
                    if (retry)
                    {
                        SetWaitTime(waitTime + 10000);
                        WebCrawler.Instance.EnqueueWork(this);
                    }
                }
                else if (childPage != null)
                {
                    childPage.InvokeLoadedEvent();
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.ToString());
                // Bare rethrow keeps the original stack trace ("throw ex;" would reset it).
                throw;
            }
            finally
            {
                // response is only non-null when EndGetResponse succeeded.
                if (response != null)
                {
                    HandleResponse(response);
                }
            }
        }
Пример #4
0
 /// <summary>
 /// Appends the given page to the <c>childPages</c> collection.
 /// </summary>
 /// <param name="page">The page to append.</param>
 public void AddChildPage(ChildPage page) => childPages.Add(page);