Example #1
 private void ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
 {
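     // Record stats only for pages that actually returned HTTP 200 OK.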
     if (e.CrawledPage.HttpWebResponse != null && e.CrawledPage.HttpWebResponse.StatusCode == HttpStatusCode.OK)
     {
         _crawlingStats.ProcessCrawledPage(e.CrawledPage);
     }
 }
Example #2
        static void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
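            // Parse the auction page: the item name comes from the first <h1>.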
            string strPage = e.CrawledPage.Content.Text;

            HtmlNodeCollection aTags = e.CrawledPage.HtmlDocument.DocumentNode.SelectNodes("//h1");
            string name="";
            if (aTags != null && aTags.Count > 0)
            {
                name = aTags[0].InnerText;

                Console.WriteLine(name);
            }

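            // Locate the bid-history table and its rows via HtmlAgilityPack XPath.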
            HtmlNodeCollection aTable = e.CrawledPage.HtmlDocument.DocumentNode.SelectNodes("//table[@id='auction-bidhist']");
            HtmlNodeCollection aTr = e.CrawledPage.HtmlDocument.DocumentNode.SelectNodes("//table[@id='auction-bidhist']//tr");
            //int index = strPage.IndexOf("col_half col_last single-product");
            //int devEnd = strPage.IndexOf("</h1>", index);
            //string str = strPage.Substring(index, devEnd - index);
            //str = str.Substring(str.IndexOf("<h1>") + 4);

            if (aTr != null && aTr.Count > 1 && aTr[1].InnerText.IndexOf("No") == -1)
            {
                if( aTr[0].InnerText.IndexOf("Winning")==-1)
                {
                    string k = "#";

                }
            }

            //Console.WriteLine(e.CrawledPage.Content);
            //Process data
        }
Example #3
        public void Constructor_ValidArg_SetsPublicProperty()
        {
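            // Arrange a CrawledPage, wrap it in the event args, and assert the same instance is exposed.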
            CrawledPage page = new CrawledPage(new Uri("http://aaa.com/"));
            PageCrawlCompletedArgs uut = new PageCrawlCompletedArgs(new CrawlContext(), page);

            Assert.AreSame(page, uut.CrawledPage);
        }
Example #4
        static void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
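            // Always archive the raw page; extract property details only for URLs flagged by the demo parameters.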
            saveWebPage(e);

            if (e.CrawledPage.Uri.AbsoluteUri.Contains(DemoParameters.shouldSaveToPropertyTableParameter))
                saveProperty(e);
        }
Example #5
        private void crawler_PageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
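            // Record each crawled URL and status code for later verification.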
            PageResult pageResult = new PageResult();
            pageResult.Url = e.CrawledPage.Uri.AbsoluteUri;
            if(e.CrawledPage.HttpWebResponse != null)
                pageResult.HttpStatusCode = Convert.ToInt32(e.CrawledPage.HttpWebResponse.StatusCode);

            _actualCrawledPages.Add(pageResult);
        }
Example #6
        void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            CrawledPage crawledPage = e.CrawledPage;

            if (crawledPage.WebException != null || crawledPage.HttpWebResponse == null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
                Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
            else
                Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);

            if (string.IsNullOrEmpty(crawledPage.Content.Text))
                Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
        }
Example #7
        static void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            CrawledPage crawledPage = e.CrawledPage;

            if (crawledPage.WebException != null || crawledPage.HttpWebResponse == null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
                SaveURLFail(crawledPage.Uri.AbsoluteUri.ToString());
            else
                SaveURLSuccess(crawledPage.Uri.AbsoluteUri.ToString());

            if (string.IsNullOrEmpty(crawledPage.RawContent))
                SaveURLNoContent(crawledPage.Uri.AbsoluteUri.ToString());
        }
Example #8
 private static void OnPageCrawlCompleted(object p_Sender, PageCrawlCompletedArgs p_PageCrawlCompletedArgs)
 {
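     // Log the outcome, including retry details when Abot re-requested the page.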
     CrawledPage page = p_PageCrawlCompletedArgs.CrawledPage;
     if (page.WebException != null) {
         Console.WriteLine("Crawl of page \"{0}\" failed{2}. Error: {1}",
             page.Uri.AbsoluteUri,
             page.WebException.Message,
             page.IsRetry ? String.Format(" (Retry #{0})", page.RetryCount) : "");
     } else if (page.HttpWebResponse.StatusCode != HttpStatusCode.OK) {
         Console.WriteLine("Crawl of page \"{0}\" failed{2}. HTTP Status Code: {1}",
             page.Uri.AbsoluteUri,
             page.HttpWebResponse.StatusCode,
             page.IsRetry ? String.Format(" (Retry #{0})", page.RetryCount) : "");
     } else {
         Console.WriteLine("Page crawl completed [{0}]", page.Uri.AbsoluteUri);
     }
 }
Example #9
        public void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
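            // Notify the UI of progress, then scrape name, seller and buyer from the auction page.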
            backgroundWorker.ReportProgress(1);

            string strPage = e.CrawledPage.Content.Text;

            HtmlNodeCollection aTags = e.CrawledPage.HtmlDocument.DocumentNode.SelectNodes("//h1");
            string name = "", seller = "", buyer = "";
            if (aTags != null && aTags.Count > 0)
            {
                name = aTags[0].InnerText;

            }

            HtmlNodeCollection aTable = e.CrawledPage.HtmlDocument.DocumentNode.SelectNodes("//table[@id='auction-bidhist']");
            HtmlNodeCollection aTr = e.CrawledPage.HtmlDocument.DocumentNode.SelectNodes("//table[@id='auction-bidhist']//tr");
            //int index = strPage.IndexOf("col_half col_last single-product");
            //int devEnd = strPage.IndexOf("</h1>", index);
            //string str = strPage.Substring(index, devEnd - index);
            //str = str.Substring(str.IndexOf("<h1>") + 4);

            HtmlNodeCollection sellerA = e.CrawledPage.HtmlDocument.DocumentNode.SelectNodes("//h4//a");
            if (sellerA != null && sellerA.Count > 0)
            {
                seller = sellerA[0].InnerText;
            }

            if (aTr != null && aTr.Count > 1 && aTr[1].InnerText.IndexOf("No") == -1)
            {
                if (aTr[0].InnerText.IndexOf("Winning") == -1)
                {
                    // Relative XPath (".//") keeps the search inside this row; guard against rows without links.
                    HtmlNodeCollection abbTr = aTr[1].SelectNodes(".//td//a");
                    if (abbTr != null && abbTr.Count > 0)
                        buyer = abbTr[0].InnerText;
                }
            }

            if (name.Trim().Length > 0)
            {
                SetText(name, seller, buyer, e.CrawledPage.Uri.ToString());

            }

            //Console.WriteLine(e.CrawledPage.Content);
            //Process data
        }
Example #10
 /// <summary>
 /// Asynchronous event that is fired when an individual page has been crawled.
 /// </summary>
 public static void ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
 {
     if (e.CrawledPage.WebException != null || e.CrawledPage.HttpWebResponse == null || e.CrawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
         Console.WriteLine("Crawl of page failed: {0}\n", e.CrawledPage.Uri.AbsoluteUri);
     else if (string.IsNullOrEmpty(e.CrawledPage.RawContent))
     {
         Console.WriteLine("Page Crawled had no content: {0}\n", e.CrawledPage.Uri.AbsoluteUri);
     }
     else
     {
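         // Cache the parsed page keyed by URL, together with its Last-Modified timestamp.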
         String uri = e.CrawledPage.Uri.AbsoluteUri;
         //String date = new DateTime(e.CrawledPage.HttpWebResponse.Headers["Last-Modified"]);
         DateTime lastModified = e.CrawledPage.HttpWebResponse.LastModified;
         CrawledWebPage page = new CrawledWebPage(lastModified, e.CrawledPage.HtmlDocument.DocumentNode);
         _crawledPages.Add(uri, page);
         //Console.WriteLine("Crawled page saved: {0}", crawledPage.Uri.AbsoluteUri);
     }
 }
Example #11
        static void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            CrawledPage crawledPage = e.CrawledPage;

            if (Regex.IsMatch(crawledPage.Uri.ToString(), @"http://www.tingchina.com/erge/(\d+)/play_(\d+)_(\d+)\.htm"))
            {
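                // The play page embeds its media in an <iframe name="playmedia">; fetch that frame to recover the real URL.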

                var csQuery = crawledPage.CsQueryDocument.Find("iframe");
                foreach (var query in csQuery)
                {
                    if (query.Name == "playmedia")
                    {
                        var playSrc = query.GetAttribute("src");
                        var request = String.Format("http://{0}{1}", crawledPage.Uri.Host, playSrc);

                        var http = WebRequest.Create(request);
                        using (var response = http.GetResponse())
                        using (var stream = response.GetResponseStream())
                        using (var sr = new StreamReader(stream))
                        {
                            var content = sr.ReadToEnd();
                            var realUrlIndex = content.IndexOf("url[3]=");
                            if (realUrlIndex != -1)
                            {
                                var realUrlEnd = content.IndexOf(";", realUrlIndex);
                                var realUrl = content.Substring(realUrlIndex, realUrlEnd - realUrlIndex);
                                Console.WriteLine("!!!!!!!!!!!!!!!!!!!!   " + realUrl + "   !!!!!!!!!!!!!!!!!!!!");
                            }
                        }
                    }
                }
            }

            if (crawledPage.WebException != null || crawledPage.HttpWebResponse == null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
                Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
            else
                Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);

            if (string.IsNullOrEmpty(crawledPage.Content.Text))
                Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
        }
Example #12
 private void ThrowExceptionWhen_PageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
 {
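     // Deliberately throws; presumably a test hook verifying the crawler survives faulty event subscribers.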
     throw new Exception("Oh No!");
 }
Example #13
        static void saveWebPage(PageCrawlCompletedArgs e)
        {
            // save to WebPage table

            //Process data
            var webpageContext = new WebPageDataContext(DemoParameters.connectionString);
            //var webpageContext = new WebPageDataContext();
            //IEnumerable<WebPage> wp = dbContext.WebPages.OrderBy(c => c.pageUrl);

            WebPage page = new WebPage
            {
                pageUrl = e.CrawledPage.Uri.ToString(),
                parentUrl = e.CrawledPage.ParentUri.ToString(),
                requestStartTime = e.CrawledPage.RequestStarted.ToString(),
                requestEndTime = e.CrawledPage.RequestCompleted.ToString(),
                downloadStartTime = e.CrawledPage.DownloadContentStarted.ToString(),
                downloadEndTime = e.CrawledPage.DownloadContentCompleted.ToString(),
                //pageHtml = e.CrawledPage.Content.Text
                pageHtml = ""
            };

            webpageContext.WebPages.InsertOnSubmit(page);

            try
            {
                webpageContext.SubmitChanges();
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex);
                // Make some adjustments.
                // ...
                // Try again.
                webpageContext.SubmitChanges();
            }
            finally
            {
                if (webpageContext != null)
                    webpageContext.Dispose();
            }
        }
Example #14
        static void saveProperty(PageCrawlCompletedArgs e)
        {
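            // Scrape each listing field with a dedicated XPath, defaulting to "" when the node is missing.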
            // save to Property table
            var propertyContext = new PropertyDataContext(DemoParameters.connectionString);
            //var propertyContext = new PropertyDataContext();

            HtmlNode addressNode = e.CrawledPage.HtmlDocument.DocumentNode.SelectSingleNode("//span[@class='js-address']");
            string addr = "";
            if (addressNode != null) addr = addressNode.InnerText.Trim();

            HtmlNode priceNode = e.CrawledPage.HtmlDocument.DocumentNode.SelectSingleNode("//dd[@class='price']");
            string pric = "";
            if (priceNode != null) pric = priceNode.InnerText.Trim();

            HtmlNode propertytypeNode = e.CrawledPage.HtmlDocument.DocumentNode.SelectSingleNode("//dd[@class='propertytype']");
            string ptype = "";
            if (propertytypeNode != null) ptype = propertytypeNode.InnerText.Trim();

            HtmlNode saletypeNode = e.CrawledPage.HtmlDocument.DocumentNode.SelectSingleNode("//dd[@class='saleType']");
            string stype = "";
            if (saletypeNode != null) stype = saletypeNode.InnerText.Trim();

            HtmlNode saledateNode = e.CrawledPage.HtmlDocument.DocumentNode.SelectSingleNode("//dd[@class='saleDate']");
            string sdate = "";
            if (saledateNode != null) sdate = saledateNode.InnerText.Trim();

            HtmlNode landsizeNode = e.CrawledPage.HtmlDocument.DocumentNode.SelectSingleNode("//dd[@class='land']");
            string land = "";
            if (landsizeNode != null) land = landsizeNode.InnerText.Trim();

            HtmlNode featureNode = e.CrawledPage.HtmlDocument.DocumentNode.SelectSingleNode("//p[@class='features']");
            string feature = "";
            if (featureNode != null) feature = featureNode.InnerText.Trim();

            HtmlNode agentNode = e.CrawledPage.HtmlDocument.DocumentNode.SelectSingleNode("//ul[@class='cB-agentList']");
            string agentInfo = "";
            if (agentNode != null) agentInfo = agentNode.InnerText.Trim();

            HtmlNode schoolNode = e.CrawledPage.HtmlDocument.DocumentNode.SelectSingleNode("//div[@class='schoolData bdy collapsible collapsed']");
            string school = "";
            if (schoolNode != null) school = schoolNode.InnerText.Trim();

            HtmlNode descriptionNode = e.CrawledPage.HtmlDocument.DocumentNode.SelectSingleNode("//div[@class='cT-productDescription']");
            string desc = "";
            if (descriptionNode != null) desc = descriptionNode.InnerText.Trim();

            Property p = new Property
            {
                pageUrl = e.CrawledPage.Uri.ToString(),
                address = addr,
                price = pric,
                propertyType = ptype,
                saleType = stype,
                saleDate = sdate,
                suburb = "",
                landSize = land,
                propertyFeature = feature,
                agents = agentInfo,
                schoolData = school,
                propertyDescription = desc
            };

            propertyContext.Properties.InsertOnSubmit(p);

            try
            {
                propertyContext.SubmitChanges();
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex);
                // Make some adjustments.
                // ...
                // Try again.
                propertyContext.SubmitChanges();
            }
            finally
            {
                if (propertyContext != null)
                    propertyContext.Dispose();
            }
        }
Example #15
 void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
 {
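     // Persist the page; log any failure instead of rethrowing so the crawl keeps going.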
     try
     {
         SaveContent(e.CrawledPage);
     }
     catch (Exception ex)
     {
         LogHelper.Instance.WriteError(ex, GetType(), MethodBase.GetCurrentMethod().Name);
     }
     //Process data
 }
Example #16
 private void PageCrawlCompletedEvent(object sender, PageCrawlCompletedArgs e)
 {
 }
Example #17
 static void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
 {
     //Process data
 }
Example #18
        private void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            CrawledPage crawledPage = e.CrawledPage;
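            // NOTE: this handler assumes HttpWebResponse is never null; add a null check here if requests can fail outright.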
            bool externalLinksFound = false;
            _logger.DebugFormat("Page Crawl Completed {0}; Status {1}; Source URL: {2}; CrawlerId: {3}; SessionId: {4}",
                                crawledPage.Uri.AbsoluteUri,
                                crawledPage.HttpWebResponse.StatusCode,
                                crawledPage.ParentUri.AbsoluteUri,
                                crawledPage.PageBag.CrawlerId,
                                crawledPage.PageBag.SessionId);

            //----------------------------------------
            // create and store the crawled link
            var crawledLink = new CrawledLink();
            crawledLink.SessionId = crawledPage.PageBag.SessionId;
            crawledLink.CrawlerId = crawledPage.PageBag.CrawlerId;
            crawledLink.SourceUrl = crawledPage.ParentUri.AbsoluteUri;
            crawledLink.TargetUrl = crawledPage.Uri.AbsoluteUri; // what was crawled
            crawledLink.StatusCode = crawledPage.HttpWebResponse.StatusCode;
            crawledLink.IsRoot = crawledPage.IsRoot;
            crawledLink.CrawlDepth = crawledPage.CrawlDepth;

            //------------

            if (crawledPage.WebException != null)
            {
                // store error information if it occurred
                crawledLink.ErrorOccurred = true;
                crawledLink.Exception = crawledPage.WebException.Message; //TODO store more data of the exception

                _logger.Error(string.Format("A WebException occurred for Target Url: {0}; Source URL: {1}; CrawlerId: {2}; SessionId: {3}",
                                crawledLink.TargetUrl, crawledLink.SourceUrl, crawledLink.CrawlerId, crawledLink.SessionId),
                              crawledPage.WebException);
            }
            _scheduler.RecordCrawledLink(crawledLink);

            //----------------------------------------
            // Check if the page should be processed, if true process it 
            //  - extract the title, keywords, description, cookies, etc from the page
            //    and save processed data.
            if (crawledPage.WebException == null)
            {       
                if (IsPageToBeProcessed(crawledPage.Uri, crawledPage.HttpWebResponse.StatusCode))
                {
                    using (var processor = _provider.GetInstanceOf<ICrawledPageProcessor>())
                    {
                        var result = processor.ProcessPage(crawledPage);
                        _repo.AddProcessedPage(result);
                    }
                }

                externalLinksFound = _scheduler.ProcessParsedLinks(crawledPage);
                if (externalLinksFound)
                {
                    OnExternalLinksFound(CrawlerId, crawledPage.Uri);
                }
            }

            string mssg = null;
            if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
            {
                mssg = string.Format("Crawl of page failed {0}; source: {1}", crawledPage.Uri.AbsoluteUri, crawledPage.ParentUri.AbsoluteUri);
                _logger.Error(mssg);
            }
            else
            {
                mssg = string.Format("Crawl of page succeeded {0}; source: {1}", crawledPage.Uri.AbsoluteUri, crawledPage.ParentUri.AbsoluteUri);
                _logger.Debug(mssg);
            }

            if (string.IsNullOrEmpty(crawledPage.Content.Text))
            {
                mssg = string.Format("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
                _logger.Error(mssg);
            }

            //------------

            OnLinkCrawlCompleted(CrawlerDefinition, 
                                 crawledPage.ParentUri.AbsoluteUri, 
                                 crawledPage.Uri.AbsoluteUri, 
                                 crawledPage.HttpWebResponse.StatusCode,
                                 crawledPage.WebException != null,
                                 externalLinksFound);
        }
Example #19
        static void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            CrawledPage crawledPage = e.CrawledPage;
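            // Scans forum posts for "活动" (event) announcements and parses their dates.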

            if (crawledPage.WebException != null || crawledPage.HttpWebResponse == null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
                Debug.WriteLine(string.Format("Crawl of page {0} failed, reason: {1}",
                    crawledPage.Uri.AbsoluteUri,
                    crawledPage.WebException != null ? crawledPage.WebException.Message : "non-OK status code"));

            string text = crawledPage.Content.Text;

            if (string.IsNullOrEmpty(text))
                Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
            else if (text.Contains("活动八"))
            {

            }
            else
            {

                var c = Regex.Matches(text, "活动(一|二|三|四|五|六|七|八|九|十)(.|\n)*?活动时间(.|\n)*?<br");
                if (c.Count == 0)
                {
                    var a = Regex.Match(text, "thread_subject(.|\n)*?活动(.|\n){1,50}?</a");
                    if (a.Value != "")
                    {
                        var b = Regex.Match(text, "活动时间(.|\n)*?<br");
                        var time = Regex.Match(b.Value, "\\d{1,2}月\\d{1,2}(-|日)").Value;
                        Debug.Write(DateTime.Parse(time) + "   ");
                        Debug.WriteLine(Regex.Match(a.Value, "活动(.|\n)*?</a").Value.Replace("】", ":").Replace("<br", "").Replace("</a", ""));
                    }
                }

                List<string> list = new List<string>() { "古城探宝", "天降神剑" };

                foreach (Match item in c)
                {
                    string date = Regex.Match(item.Value, "\\d{1,2}月\\d{1,2}(-|日)").Value;
                    if (date != "" && !date.Contains("日"))
                    {
                        date = date.Replace("-", "日");
                    }
                    Debug.Write(DateTime.Parse(date) + "   ");
                    Debug.WriteLine(Regex.Match(item.Value, "活动(一|二|三|四|五|六|七|八|九|十)(.|\n)*?<br").Value.Replace("】", ":").Replace("<br", "").Replace("</a", ""));
                    var index = list.FindIndex(x => item.Value.Contains(x));
                    if (index != -1)
                    {
                        Activity activity = new Activity();
                        activity.Name = list[index];
                        activity.Date = DateTime.Parse(date);

                    }
                }

            }
        }
Example #20
        void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
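            // Route pages by URL: disp_*.htm pages register books, play_*_*.htm pages yield audio sources.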
            CrawledPage crawledPage = e.CrawledPage;

            var crawledPageUri = crawledPage.Uri.ToString();
            //Crawling books
            if (Regex.IsMatch(crawledPageUri, @"http://www.tingchina.com/*.*/disp_(\d+).htm"))
            {
                int id = Int32.Parse(Regex.Match(crawledPageUri, @"(\d+)").Captures[0].Value);
                lock (booksLockObject)
                {
                    var book = new BookViewModel(id, crawledPage.Uri);
                    Books.Add(book);
                }
            }

            //Crawling book sections
            if (Regex.IsMatch(crawledPageUri, @"http://www.tingchina.com/*.*/(\d+)/play_(\d+)_(\d+)\.htm"))
            {
                var csQuery = crawledPage.CsQueryDocument.Find("iframe");
                foreach (var query in csQuery)
                {
                    if (query.Name == "playmedia")
                    {
                        var playSrc = query.GetAttribute("src");
                        var request = String.Format("http://{0}{1}", crawledPage.Uri.Host, playSrc);

                        lock (lockObject)
                        {
                            Sources.Add(new SoundPlayerSourceViewModel(request));
                        }
                    }
                }
            }

            if (crawledPage.WebException != null || crawledPage.HttpWebResponse == null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
                Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
            else
                Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);

            if (string.IsNullOrEmpty(crawledPage.Content.Text))
                Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
        }
Example #21
        void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
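            // Abot raises this event from worker threads, so serialize log writes with a lock.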
            if (e.CrawledPage.HttpWebResponse != null)
            {
                int status = (int)e.CrawledPage.HttpWebResponse.StatusCode;
                string url = e.CrawledPage.Uri.ToString();
                string parent = e.CrawledPage.ParentUri.ToString();

                lock (_lock)
                {
                    _logfile.WriteLine("Url {0} - status code {1} seen at {2}", url, status, parent);
                }
            }

            pagesDownloaded++;
            Console.Write("Pages downloaded {0}\r", pagesDownloaded);
            AddNewPage(e.CrawledPage);
        }