Example #1
0
 /// <summary>
 /// Saves every element found directly under the root of <paramref name="data"/>
 /// by passing it to the single-element <c>Save</c> overload.
 /// </summary>
 /// <param name="propertyBag">Crawl result handed through to the per-element save.</param>
 /// <param name="data">Document whose top-level child elements are saved.</param>
 public void Save(PropertyBag propertyBag, XmlDocument data)
 {
     // A document without a root element has nothing to save.
     if (data.DocumentElement == null)
     {
         return;
     }

     // ChildNodes may also hold text, whitespace or comment nodes. Enumerating it
     // as XmlElement would throw InvalidCastException on the first such node, so
     // enumerate as XmlNode and keep only the elements.
     foreach (XmlNode node in data.DocumentElement.ChildNodes)
     {
         XmlElement element = node as XmlElement;
         if (element != null)
         {
             Save(propertyBag, element);
         }
     }
 }
Example #2
0
        /// <summary>
        /// Pipeline step that resolves a display name for the detected content culture
        /// and then saves pending changes on <c>db</c>. The code that used to fill and
        /// queue the item is disabled, so nothing new is added before the save.
        /// </summary>
        /// <param name="crawler">
        /// The crawler (not used by this step).
        /// </param>
        /// <param name="propertyBag">
        /// The property bag; its "LanguageCulture" entry is read.
        /// </param>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            CultureInfo detectedCulture = (CultureInfo)propertyBag["LanguageCulture"].Value;
            string cultureDisplayValue = detectedCulture.IsNull() ? "N/A" : detectedCulture.DisplayName;

            TextExtraction extractor = new TextExtraction();

            lock (this)
            {
                // The item is created but never populated or added to db: the
                // extraction logic for eDocman pages that lived here was disabled.
                BIDVObject item = new BIDVObject();
            }

            try
            {
                db.SaveChanges();
            }
            catch (Exception saveError)
            {
                // A failed save is only reported on the console; it is not rethrown.
                Console.WriteLine("=====================================================");
                Console.WriteLine(saveError.Message);
            }
        }
        /// <summary>
        /// Pipeline step that records a crawled page as a <c>BIDVObject</c>, unless its
        /// URL is already known, and then saves pending changes on <c>db</c>.
        /// </summary>
        /// <param name="crawler">
        /// The crawler (not used by this step).
        /// </param>
        /// <param name="propertyBag">
        /// The property bag describing the downloaded page.
        /// </param>
        /// <exception cref="Exception">
        /// Thrown when saving fails. Carries the original message and keeps the
        /// original exception as <see cref="Exception.InnerException"/>.
        /// </exception>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            TextExtraction t = new TextExtraction();

            lock (this)
            {
                BIDVObject item = new BIDVObject();
                item.OriginalUrl = propertyBag.Step.Uri.ToString();

                if (!IsDuplicate(item.OriginalUrl))
                {
                    item.Title = propertyBag.Title;
                    item.StatusDescription = propertyBag.StatusDescription;
                    item.ResponseUri = propertyBag.ResponseUri.ToString();
                    item.Text = propertyBag.Text;
                    item.Depth = propertyBag.Step.Depth;
                    item.LastModified = propertyBag.LastModified;
                    item.OriginalReferrerUrl = propertyBag.OriginalReferrerUrl.ToString();
                    item.Server = propertyBag.Server;

                    // The summary is the text between the two page markers, with line
                    // breaks turned into spaces before whitespace is cleaned up.
                    string description = t.GetBetween2Words("Chi tiết văn bản", "Xem toàn màn hình", item.Text.Replace("\r", "  ").Replace("\n", "  "));
                    item.Summary = t.RemoveWhiteSpace(description);

                    string strNgayPhatHanh = t.GetBetween2Words("Ngày phát hành", "Số đi", item.Summary);
                    strNgayPhatHanh = strNgayPhatHanh.Replace(' ', '/').Remove(0, ("Ngày phát hành").Length);

                    // Split once instead of three times. The parsed day/month/year are
                    // not stored anywhere; the parses are kept because a malformed date
                    // still throws here and so prevents the item from being added.
                    string[] dateParts = strNgayPhatHanh.Split(new[] { "/" }, StringSplitOptions.None);
                    int.Parse(dateParts[1]);
                    int.Parse(dateParts[2]);
                    int.Parse(dateParts[3]);

                    // The full page text is not persisted.
                    item.Text = null;
                    item.IsToEmail = false;

                    db.AddToBIDVObjects(item);

                    item.ContentEncoding = propertyBag.ContentEncoding;
                    item.ContentType = propertyBag.ContentType;
                }
            }

            try
            {
                db.SaveChanges();
            }
            catch (Exception ex)
            {
                // Same exception type and message as before, but the original exception
                // (and its stack trace) is now preserved as the inner exception.
                throw new Exception(ex.Message, ex);
            }
        }
Example #4
0
 /// <summary>
 /// Pipeline step that picks a template for the page and, when one applies,
 /// parses the page's "HtmlDoc" entry with it and saves the result through a
 /// new <c>MongoDBSaver</c>.
 /// </summary>
 /// <param name="crawler">The crawler (not used by this step).</param>
 /// <param name="propertyBag">The property bag of the downloaded page.</param>
 public void Process(Crawler crawler, PropertyBag propertyBag)
 {
     ITemplate template = SelectTemplate(propertyBag);
     if (template == null)
     {
         // No template matches this page; nothing to extract.
         return;
     }

     // NOTE(review): the "as" cast yields null when the entry is missing or of
     // another type, and that null is handed to Parse — confirm Parse accepts it.
     HtmlAgilityPack.HtmlDocument html = propertyBag["HtmlDoc"].Value as HtmlAgilityPack.HtmlDocument;
     XmlDocument parsed = template.Parse(html);

     MongoDBSaver saver = new MongoDBSaver();
     saver.Save(propertyBag, parsed);
 }
Example #5
0
		/// <summary>
		/// Raises the AfterDownload event, if anyone is subscribed, and reports
		/// whether processing of this step should continue.
		/// </summary>
		/// <param name="crawlStep">The step that was downloaded.</param>
		/// <param name="response">The download result passed to the event handlers.</param>
		/// <returns>
		/// True to continue with this step, false when it was cancelled. Without
		/// subscribers the result is <c>crawlStep.IsAllowed</c>; otherwise it is the
		/// negation of the Cancel flag left by the handlers.
		/// </returns>
		private bool OnAfterDownload(CrawlStep crawlStep, PropertyBag response)
		{
			// Work on a local copy of the delegate for the null check and the call.
			EventHandler<AfterDownloadEventArgs> handler = AfterDownload;
			if (!handler.IsNull())
			{
				// Cancel starts out as "not allowed"; handlers may change it.
				AfterDownloadEventArgs args =
					new AfterDownloadEventArgs(!crawlStep.IsAllowed, response);
				handler(this, args);
				return !args.Cancel;
			}

			return crawlStep.IsAllowed;
		}
        /// <summary>
        /// Pipeline step that records a crawled page as an <c>ASPNETObject</c>, unless
        /// its URL is already known, and then saves pending changes on <c>db</c>.
        /// </summary>
        /// <param name="crawler">
        /// The crawler (not used by this step).
        /// </param>
        /// <param name="propertyBag">
        /// The property bag describing the downloaded page.
        /// </param>
        /// <exception cref="Exception">
        /// Thrown when saving fails. Carries the original message and keeps the
        /// original exception as <see cref="Exception.InnerException"/>.
        /// </exception>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            lock (this)
            {
                ASPNETObject item = new ASPNETObject();
                item.OriginalUrl = propertyBag.Step.Uri.ToString();

                if (!IsDuplicate(item.OriginalUrl))
                {
                    item.Title = propertyBag.Title;
                    item.StatusDescription = propertyBag.StatusDescription;
                    item.ResponseUri = propertyBag.ResponseUri.ToString();
                    // The page text is deliberately not persisted.
                    item.Text = null;
                    item.Depth = propertyBag.Step.Depth;
                    item.LastModified = propertyBag.LastModified;
                    item.OriginalReferrerUrl = propertyBag.OriginalReferrerUrl.ToString();
                    item.Server = propertyBag.Server;

                    db.AddToASPNETObjects(item);

                    item.ContentEncoding = propertyBag.ContentEncoding;
                    item.ContentType = propertyBag.ContentType;
                    item.IsToEmail = false;
                    // The title doubles as the summary for these pages.
                    item.Summary = propertyBag.Title;
                }
            }

            try
            {
                db.SaveChanges();
            }
            catch (Exception ex)
            {
                // Same exception type and message as before, but the original exception
                // (and its stack trace) is now preserved as the inner exception.
                throw new Exception(ex.Message, ex);
            }
        }
Example #7
0
 /// <summary>
 /// Chooses the template for a page based on its response URI.
 /// </summary>
 /// <param name="propertyBag">The property bag whose ResponseUri is matched.</param>
 /// <returns>
 /// The XSLT template, loaded from "cnblogs-post-lightweight.xslt" on first use and
 /// then reused, when the URI matches the archive pattern; otherwise null.
 /// </returns>
 protected virtual Template.ITemplate SelectTemplate(PropertyBag propertyBag)
 {
     bool isArchivePost = Regex.IsMatch(propertyBag.ResponseUri.AbsoluteUri, "http://archive.cnblogs.com/a/.*");
     if (!isArchivePost)
     {
         return null;
     }

     // Load the stylesheet only once and keep it in the field afterwards.
     if (mytemplate == null)
     {
         XmlDocument stylesheet = new XmlDocument();
         stylesheet.Load("cnblogs-post-lightweight.xslt");
         mytemplate = new XSLTTemplate(stylesheet);
     }

     return mytemplate;
 }
Example #8
0
 /// <summary>
 /// Builds one document from the child elements of <paramref name="data"/> and
 /// inserts it into the collection named after the element.
 /// </summary>
 /// <param name="propertyBag">Supplies the "url" and, when present, "referrer" values.</param>
 /// <param name="data">Element whose non-empty child elements become document fields.</param>
 private void Save(PropertyBag propertyBag, XmlElement data)
 {
     MongoCollection<BsonDocument> collection = this.db.GetCollection(data.Name);
     BsonDocument doc = new BsonDocument();

     // ChildNodes may also hold text, whitespace or comment nodes. Enumerating it
     // as XmlElement would throw InvalidCastException on the first such node, so
     // enumerate as XmlNode and keep only the elements.
     foreach (XmlNode node in data.ChildNodes)
     {
         XmlElement e = node as XmlElement;
         if (e != null && !string.IsNullOrEmpty(e.InnerXml))
         {
             doc.Add(e.Name, new BsonString(e.InnerXml));
         }
     }

     doc.Add("url", new BsonString(propertyBag.ResponseUri.AbsoluteUri));
     if (propertyBag.Referrer.Uri != null)
     {
         doc.Add("referrer", new BsonString(propertyBag.Referrer.Uri.AbsoluteUri));
     }

     // Echo the first field of the document before inserting it.
     Console.WriteLine(doc[0].AsString);
     collection.Insert(doc);
 }
Example #9
0
        /// <summary>
        /// Pipeline step that prints a summary of the downloaded step to the console.
        /// </summary>
        /// <param name = "crawler">
        /// 	The crawler; its ThreadsInUse value is printed.
        /// </param>
        /// <param name = "propertyBag">
        /// 	The property bag whose URI, text, content type, depth and culture are printed.
        /// </param>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            CultureInfo contentCulture = (CultureInfo)propertyBag["LanguageCulture"].Value;
            string cultureDisplayValue = "N/A";
            if (!contentCulture.IsNull())
            {
                cultureDisplayValue = contentCulture.DisplayName;
            }

            // Keep the lines of one step together in the console output.
            lock (this)
            {
                Console.Out.WriteLine(ConsoleColor.Gray, "Url: {0}", propertyBag.Step.Uri);
                // Pass the page text as an argument, like the other lines do. Concatenating
                // it into the format parameter means braces in the page content would be
                // read as format placeholders (presumably raising FormatException).
                Console.Out.WriteLine(ConsoleColor.Blue, "stuff -> {0}", propertyBag.Text);
                Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tContent type: {0}", propertyBag.ContentType);
                Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tContent length: {0}",
                    propertyBag.Text.IsNull() ? 0 : propertyBag.Text.Length);
                Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tDepth: {0}", propertyBag.Step.Depth);
                Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tCulture: {0}", cultureDisplayValue);
                Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tThreadId: {0}", Thread.CurrentThread.ManagedThreadId);
                Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tThread Count: {0}", crawler.ThreadsInUse);
                Console.Out.WriteLine();
            }
        }
Example #10
0
        /// <summary>
        /// Pipeline step that mirrors the crawl result into the repository: a page
        /// answered with 200 is added or updated, a 404 is deleted, and any other
        /// status is only logged as a warning.
        /// </summary>
        /// <param name="crawler">The crawler; its CrawlFinished event is subscribed on first use.</param>
        /// <param name="propertyBag">The property bag of the downloaded step.</param>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            // Hook the finished event only the first time this step runs.
            if (!bindevents)
            {
                crawler.CrawlFinished += crawler_CrawlFinished;
                bindevents = true;
            }

            string id = config.GetDocumentPath(propertyBag.Step.Uri);

            switch (propertyBag.StatusCode)
            {
                case System.Net.HttpStatusCode.OK:
                    repository.AddUpdate(id, propertyBag.Title, propertyBag.Text, propertyBag.LastModified);
                    log.Info("Add/Update [" + id + "]");
                    break;

                case System.Net.HttpStatusCode.NotFound:
                    log.Warning("Crawler encoutered 404 for [" + id + "]");
                    repository.Delete(id);
                    break;

                default:
                    log.Warning(string.Format("Crawler encountered status {0} - {4} ({1}) for document {2} - {3}", propertyBag.StatusCode.ToString(), propertyBag.StatusDescription, id, propertyBag.Step.Uri, ((int)propertyBag.StatusCode).ToString()));
                    break;
            }
        }
Example #11
0
		/// <summary>
		/// Logs an exception thrown while processing the pipeline and passes it on
		/// through <c>PipelineException.ExecuteEvent</c>.
		/// </summary>
		/// <param name="propertyBag">The property bag that was being processed.</param>
		/// <param name="exception">The exception that was thrown.</param>
		private void OnProcessorException(PropertyBag propertyBag, Exception exception)
		{
			m_Logger.Error(
				"Exception while processing pipeline for {0}, error was {1}",
				propertyBag.OriginalUrl,
				exception);

			// The event args are built by the lambda, not up front.
			PipelineException.ExecuteEvent(
				this,
				() => new PipelineExceptionEventArgs(propertyBag, exception));
		}
Example #12
0
            /// <summary>
            /// Pipeline step that adds the page to the Lucene index as a document with
            /// "url", "content" and "title" fields, unless the page has no text or its
            /// URL was indexed before.
            /// </summary>
            /// <param name="crawler">The crawler (not used by this step).</param>
            /// <param name="propertyBag">The property bag of the downloaded page.</param>
            public void Process(Crawler crawler, PropertyBag propertyBag)
            {
                if (string.IsNullOrEmpty(propertyBag.Text))
                {
                    return;
                }

                string url = propertyBag.Step.Uri.ToString();
                if (PreviouslyIndexed(url))
                {
                    return;
                }

                Lucene.Net.Documents.Document doc = new Lucene.Net.Documents.Document();

                // All three fields are stored, analyzed and carry term vectors.
                doc.Add(new Lucene.Net.Documents.Field("url", url, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.YES));
                doc.Add(new Lucene.Net.Documents.Field("content", propertyBag.Text, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.YES));

                // A page without a title would otherwise hand a null value to the
                // Field constructor; index an empty title instead.
                string title = propertyBag.Title ?? string.Empty;
                doc.Add(new Lucene.Net.Documents.Field("title", title, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.YES));

                // Write the document to the index.
                indexWriter.AddDocument(doc);
            }
Example #13
0
        /// <summary>
        /// Pipeline step for a downloaded page. When the step has a crawl-history record
        /// and came back as HTML with status 200, the page HTML is written to disk (if its
        /// URL matches the configured patterns), the step is removed from the crawl queue
        /// and every link found in the page is added to the crawler as a new step.
        /// In all other cases the step is only removed from the crawl queue.
        /// </summary>
        /// <param name="crawler">Crawler that receives the discovered links.</param>
        /// <param name="propertyBag">Download result of the current step.</param>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            // Longer step keys are truncated and longer links are skipped; presumably
            // this is the width of the key column — TODO confirm.
            const int maxKeyLength = 396;

            AspectF.Define.
                NotNull(crawler, "crawler").
                NotNull(propertyBag, "propertyBag");

            string stepUri = Uri.UnescapeDataString(propertyBag.Step.Uri.AbsoluteUri);
            if (stepUri.Length > maxKeyLength)
            {
                stepUri = stepUri.Substring(0, maxKeyLength);
            }

            var crawlHistory = AspectF.Define.
               Return<CrawlHistory, NCrawlerEntitiesDbServices>(
                   e => e.CrawlHistory.Where(m => m.Key == stepUri).FirstOrDefault());

            if (crawlHistory == null)
            {
                DeleteFromCrawlQueue(stepUri);
                return;
            }

            try
            {
                // Failed downloads and non-HTML content are dequeued without further work.
                if (propertyBag.StatusCode != HttpStatusCode.OK || !IsHtmlContent(propertyBag.ContentType))
                {
                    DeleteFromCrawlQueue(crawlHistory.Key);
                    return;
                }

                HtmlDocument htmlDoc = LoadHtmlDocument(propertyBag);
                string orginalHtmlContent = htmlDoc.DocumentNode.OuterHtml;
                string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
                DocumentWithLinks links = htmlDoc.GetLinks();

                if (MatchesStorageRules(propertyBag.Step.Uri.AbsoluteUri))
                {
                    var folderHelper = new FolderHelper();
                    var path = folderHelper.GetFolderPathToStore(crawlHistory.GroupId) + "/" + string.Format("{0}.txt", crawlHistory.Id);
                    Console.Write(path);
                    WriteFileIfMissing(path, orginalHtmlContent);
                }

                DeleteFromCrawlQueue(crawlHistory.Key);

                foreach (string link in links.Links.Union(links.References))
                {
                    if (link.IsNullOrEmpty() || link.Length > maxKeyLength)
                    {
                        continue;
                    }

                    string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(link);
                    string normalizedLink;
                    try
                    {
                        normalizedLink = NormalizeLink(baseUrl, decodedLink);
                    }
                    catch (Exception)
                    {
                        // Best effort: a link that cannot be normalized is skipped.
                        continue;
                    }

                    if (normalizedLink.IsNullOrEmpty())
                    {
                        continue;
                    }

                    // Skip a link that is not a valid absolute URI instead of letting
                    // the Uri constructor throw and abandon all remaining links.
                    Uri linkUri;
                    if (!Uri.TryCreate(normalizedLink, UriKind.Absolute, out linkUri))
                    {
                        continue;
                    }

                    crawler.AddStep(linkUri, propertyBag.Step.Depth + 1,
                        propertyBag.Step, new Dictionary<string, object>
                        {
                            {Resources.PropertyBagKeyOriginalUrl, link},
                            {Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri}
                        });
                }
            }
            catch (Exception ex)
            {
                // Whatever went wrong, make sure the step does not stay in the queue.
                DeleteFromCrawlQueue(crawlHistory.Key);
                LogProcessingError(ex);
            }
        }

        /// <summary>Deletes the crawl-queue row with the given key.</summary>
        private void DeleteFromCrawlQueue(string key)
        {
            AspectF.Define.Do<NCrawlerEntitiesDbServices>(e =>
            {
                e.ExecuteStoreCommand("delete Crawlqueue where [key] ={0}", key);
            });
        }

        /// <summary>Parses the response stream into an HTML document, using the detected encoding when there is one.</summary>
        private HtmlDocument LoadHtmlDocument(PropertyBag propertyBag)
        {
            HtmlDocument htmlDoc = new HtmlDocument
            {
                OptionAddDebuggingAttributes = false,
                OptionAutoCloseOnEnd = true,
                OptionFixNestedTags = true,
                OptionReadEncoding = true
            };

            using (Stream reader = propertyBag.GetResponse())
            {
                Encoding documentEncoding = htmlDoc.DetectEncoding(reader);
                // Detection consumed part of the stream; rewind before loading.
                reader.Seek(0, SeekOrigin.Begin);
                if (!documentEncoding.IsNull())
                {
                    htmlDoc.Load(reader, documentEncoding, true);
                }
                else
                {
                    htmlDoc.Load(reader, true);
                }
            }

            return htmlDoc;
        }

        /// <summary>
        /// Returns true when the page should be stored: either no patterns are configured
        /// in OriginalWebSite.txt, or the URI matches one of them (case-insensitive).
        /// </summary>
        private bool MatchesStorageRules(string absoluteUri)
        {
            string cacheKey = AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite";
            var jsonStr = cache.Get(cacheKey) as string;
            if (jsonStr == null)
            {
                // Read the configuration file and keep its text in the cache for one day.
                using (var stream = new StreamReader(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite.txt", Encoding.UTF8))
                {
                    jsonStr = stream.ReadToEnd();
                    var policy = new CacheItemPolicy();
                    policy.Priority = CacheItemPriority.NotRemovable;
                    policy.AbsoluteExpiration = DateTimeOffset.Now.AddDays(1);
                    cache.Set(cacheKey, jsonStr, policy);
                    Console.WriteLine("cache --" + AppDomain.CurrentDomain.BaseDirectory + " :" + cache.Get(cacheKey));
                }
            }

            var json = JsonConvert.DeserializeObject<OriginalWebSiteTxt>(jsonStr);
            List<string> recipeRegex = json.RecipeRegex;
            if (recipeRegex == null || recipeRegex.Count == 0)
            {
                return true;
            }

            foreach (var regex in recipeRegex)
            {
                if (Regex.IsMatch(absoluteUri, regex, RegexOptions.IgnoreCase))
                {
                    return true;
                }
            }

            return false;
        }

        /// <summary>Writes the content to a new file at the path unless the file already exists; write failures are logged.</summary>
        private void WriteFileIfMissing(string path, string content)
        {
            if (File.Exists(path))
            {
                return;
            }

            try
            {
                using (StreamWriter sw = File.CreateText(path))
                {
                    sw.WriteLine(content);
                }
            }
            catch (Exception ex)
            {
                LogProcessingError(ex);
            }
        }

        /// <summary>Logs the exception through the log4net logger named "logger-name".</summary>
        private void LogProcessingError(Exception ex)
        {
            log4net.Config.XmlConfigurator.Configure();
            log4net.ILog log = log4net.LogManager.GetLogger("logger-name");
            log.Error(ex);
        }
Example #14
0
 /// <summary>
 /// Runs one pipeline step for the given property bag. A step that declares a
 /// timeout is run through the task runner with that timeout; any other step is
 /// called directly. Exceptions are passed to <c>OnProcessorException</c>.
 /// </summary>
 /// <param name="pipelineStep">The step to run.</param>
 /// <param name="propertyBag">The downloaded content handed to the step.</param>
 private void ExecutePipeLineStep(IPipelineStep pipelineStep, PropertyBag propertyBag)
 {
     try
     {
         IPipelineStepWithTimeout stepWithTimeout = pipelineStep as IPipelineStepWithTimeout;
         if (stepWithTimeout == null)
         {
             m_Logger.Debug("Running pipeline step {0}", pipelineStep.GetType().Name);
             pipelineStep.Process(this, propertyBag);
         }
         else
         {
             m_Logger.Debug("Running pipeline step {0} with timeout {1}",
                 pipelineStep.GetType().Name, stepWithTimeout.ProcessorTimeout);
             m_TaskRunner.RunSync(() => pipelineStep.Process(this, propertyBag), stepWithTimeout.ProcessorTimeout);
         }
     }
     catch (Exception ex)
     {
         OnProcessorException(propertyBag, ex);
     }
 }
Example #15
0
 /// <summary>
 /// Passes the downloaded content to every step in <c>Pipeline</c> by calling
 /// <c>ExecutePipeLineStep</c> for each of them. The steps are where data such as
 /// URLs to follow or email addresses is extracted from the content.
 /// </summary>
 /// <param name="propertyBag">Downloaded content</param>
 private void ExecutePipeLine(PropertyBag propertyBag)
 {
     Pipeline.ForEach(step => ExecutePipeLineStep(step, propertyBag));
 }
Example #16
0
        /// <summary>
        /// Runs one pipeline step for the given property bag and logs how long it took.
        /// A step that declares a timeout is run through the task runner with that
        /// timeout and is skipped when the run is already cancelled; any other step is
        /// called directly. Exceptions are passed to <c>OnProcessorException</c>.
        /// </summary>
        /// <param name="pipelineStep">The step to run.</param>
        /// <param name="propertyBag">The downloaded content handed to the step.</param>
        private void ExecutePipeLineStep(IPipelineStep pipelineStep, PropertyBag propertyBag)
        {
            try
            {
                Stopwatch timer = Stopwatch.StartNew();
                m_Logger.Debug("Executing pipeline step {0}", pipelineStep.GetType().Name);

                IPipelineStepWithTimeout stepWithTimeout = pipelineStep as IPipelineStepWithTimeout;
                if (stepWithTimeout == null)
                {
                    pipelineStep.Process(this, propertyBag);
                }
                else
                {
                    m_Logger.Debug("Running pipeline step {0} with timeout {1}",
                        pipelineStep.GetType().Name, stepWithTimeout.ProcessorTimeout);
                    m_TaskRunner.RunSync(cancelArgs =>
                        {
                            if (cancelArgs.Cancel)
                            {
                                return;
                            }

                            pipelineStep.Process(this, propertyBag);
                        }, stepWithTimeout.ProcessorTimeout);
                }

                m_Logger.Debug("Executed pipeline step {0} in {1}", pipelineStep.GetType().Name, timer.Elapsed);
            }
            catch (Exception ex)
            {
                OnProcessorException(propertyBag, ex);
            }
        }
Example #17
0
    /// <summary>
    /// Pipeline step that records the page in <c>paginas</c> when its text contains
    /// <c>ContentToBeFound</c>. The recorded link carries the page URL twice and an
    /// excerpt of at most the first 100 characters of the text.
    /// </summary>
    /// <param name="crawler">The crawler (not used by this step).</param>
    /// <param name="propertyBag">The property bag of the downloaded page.</param>
    public void Process(Crawler crawler, PropertyBag propertyBag)
    {
        const int excerptLength = 100;

        string text = propertyBag.Text;
        if (text.IsNullOrEmpty())
        {
            return;
        }

        if (text.IndexOf(ContentToBeFound) == -1)
        {
            return;
        }

        // Remove(100) throws ArgumentOutOfRangeException when the text is shorter
        // than 100 characters; a short text is used as the excerpt unchanged.
        string excerpt = text.Length > excerptLength ? text.Remove(excerptLength) : text;
        string url = propertyBag.Step.Uri.ToString();
        paginas.Add(new Link(url, url, excerpt));
    }
Example #18
0
        /// <summary>
        /// Completes the handling of one download. A failed download is reported through
        /// <c>OnDownloadException</c>. A successful one gets its referrer and the queue
        /// entry's initial properties assigned and, unless the AfterDownload handlers
        /// cancel it, is run through every pipeline step. The counter cookie is disposed
        /// before the queue is processed again.
        /// </summary>
        /// <param name="crawlStep">The step that was downloaded.</param>
        /// <param name="propertyBag">The download result; may be null.</param>
        /// <param name="exception">The download error, or null on success.</param>
        /// <param name="counterCookie">Cookie disposed when handling is finished; also supplies the queue entry.</param>
        private void EndDownload(CrawlStep crawlStep, PropertyBag propertyBag, Exception exception,
            ThreadSafeCounter.ThreadSafeCounterCookie counterCookie)
        {
            using (counterCookie)
            {
                bool downloadFailed = exception != null;
                if (downloadFailed)
                {
                    OnDownloadException(exception, crawlStep);
                }
                else if (!propertyBag.IsNull())
                {
                    propertyBag.Referrer = crawlStep;

                    // Copy the properties that were queued together with the step.
                    var initialProperties = counterCookie.CrawlerQueueEntry.Properties;
                    if (!initialProperties.IsNull())
                    {
                        initialProperties.ForEach(entry => propertyBag[entry.Key].Value = entry.Value);
                    }

                    bool continueCrawl = OnAfterDownload(crawlStep, propertyBag);
                    if (continueCrawl)
                    {
                        // Run the pipeline steps one after another on the downloaded
                        // content; this is where links, email addresses and the like
                        // are extracted.
                        Pipeline.ForEach(step => ExecutePipeLineStep(step, propertyBag));
                    }
                }
            }

            ProcessQueue();
        }
			/// <summary>
			/// Posts a new property bag for <paramref name="uri"/> to the transform block.
			/// </summary>
			/// <param name="uri">The address to crawl.</param>
			/// <param name="referer">
			/// The bag the address was found in, or null. Its step depth plus one becomes
			/// the new depth; without a referer or step the depth is 0.
			/// </param>
			public void Crawl(Uri uri, PropertyBag referer)
			{
				int? parentDepth = referer?.Step?.Depth;
				int depth = parentDepth.HasValue ? parentDepth.Value + 1 : 0;

				// NOTE(review): the new bag inherits the referer's own Referrer rather
				// than the referer's Step — confirm this is intended.
				PropertyBag next = new PropertyBag
				{
					Step = new CrawlStep(uri, depth),
					Referrer = referer?.Referrer,
					UserAgent = _userAgent
				};

				_transformBlock.Post(next);
			}
Example #20
0
            /// <summary>
            /// Pipeline step that reports the title of the processed page through
            /// <c>InvokeOneWorkFinished</c>. The console diagnostics that used to be
            /// printed here are disabled.
            /// </summary>
            /// <param name="crawler">
            /// The crawler (not used by this step).
            /// </param>
            /// <param name="propertyBag">
            /// The property bag; only its title is read.
            /// </param>
            public void Process(Crawler crawler, PropertyBag propertyBag)
            {
                // One notification at a time.
                lock (this)
                {
                    var pageTitle = propertyBag.Title;
                    InvokeOneWorkFinished(pageTitle);
                }
            }
Example #21
0
 /// <summary>
 /// Pipeline step that writes the URI of the current step to standard output.
 /// </summary>
 /// <param name="crawler">The crawler (not used by this step).</param>
 /// <param name="propertyBag">The property bag whose step URI is printed.</param>
 public void Process(Crawler crawler, PropertyBag propertyBag)
 {
     var stepUri = propertyBag.Step.Uri;
     Console.Out.WriteLine(stepUri);
 }
Example #22
0
 /// <summary>
 /// Pipeline step that adds every URI in <c>seeds</c> to the crawler.
 /// </summary>
 /// <param name="crawler">The crawler that receives the seed URIs.</param>
 /// <param name="propertyBag">The property bag (not used by this step).</param>
 public void Process(Crawler crawler, PropertyBag propertyBag)
 {
     // Second argument of AddStep; presumably the depth assigned to the
     // seeded steps — TODO confirm against Crawler.AddStep.
     const int seedDepth = 2;

     foreach (Uri seed in seeds)
     {
         crawler.AddStep(seed, seedDepth);
     }
 }
		/// <summary>
		/// Builds a chain of dataflow blocks (ingest, aggregation, one block per pipeline
		/// step, termination check), links them in that order, posts every start URI
		/// into the first block and returns the completion task of the last block.
		/// </summary>
		/// <returns>The completion task of the termination-checker block.</returns>
		public Task RunAsync()
		{
			// Entry block: wraps each start URI in a PropertyBag with a depth-0 step.
			TransformBlock<Uri, PropertyBag> ingestBlock = new TransformBlock<Uri, PropertyBag>(input =>
			{
				PropertyBag result = new PropertyBag
				{
					OriginalUrl = input.ToString(),
					UserAgent = _userAgent,
					Step = new CrawlStep(input, 0)
				};

				return result;
			}, new ExecutionDataflowBlockOptions
			{
				MaxDegreeOfParallelism = MaxDegreeOfParallelism
			});

			// Pass-through block placed in front of the pipeline. It is also handed to
			// the CrawlIngestionHelper below; presumably that is how pipeline steps
			// feed newly found URLs back in — confirm against CrawlIngestionHelper.
			TransformBlock<PropertyBag, PropertyBag> ingestBlockForAggregation =
				new TransformBlock<PropertyBag, PropertyBag>(input => input, new ExecutionDataflowBlockOptions
				{
					MaxDegreeOfParallelism = MaxDegreeOfParallelism
				});

			CrawlIngestionHelper crawlIngestionHelper = new CrawlIngestionHelper(ingestBlockForAggregation, _userAgent);
			// One transform block per pipeline step, each with the step's own
			// degree of parallelism.
			TransformBlock<PropertyBag, PropertyBag>[] pipeline = Pipeline
				.Select(pipelineStep =>
				{
					return new TransformBlock<PropertyBag, PropertyBag>(async propertyBag =>
					{
						// An earlier step asked to stop: pass the bag through untouched.
						if (propertyBag.StopPipelining)
						{
							return propertyBag;
						}

						try
						{
							// A step returning false stops the remaining steps for this bag.
							propertyBag.StopPipelining = !await pipelineStep.Process(crawlIngestionHelper, propertyBag);
						}
						catch (Exception exception)
						{
							// Step failures are collected on the bag instead of faulting the block.
							propertyBag.Exceptions.Add(exception);
						}

						return propertyBag;
					}, new ExecutionDataflowBlockOptions
					{
						MaxDegreeOfParallelism = pipelineStep.MaxDegreeOfParallelism
					});
				})
				.ToArray();

			// Final block: for every bag that leaves the pipeline, checks whether any
			// block still reports buffered items; if none does, completes the ingest block.
			// NOTE(review): the check looks only at InputCount/OutputCount — confirm that
			// items currently being processed by a block cannot be missed.
			ActionBlock<PropertyBag> terminationCheckerBlock = new ActionBlock<PropertyBag>(propertyBag =>
			{
				if (ingestBlock.InputCount == 0
					&& ingestBlock.OutputCount == 0
					&& !ingestBlock.Completion.IsCompleted
					&& !ingestBlock.Completion.IsCanceled
					&& !ingestBlock.Completion.IsFaulted
					&& ingestBlockForAggregation.InputCount == 0
					&& ingestBlockForAggregation.OutputCount == 0)
				{
					if (pipeline.Any(transformBlock => transformBlock.InputCount != 0 || transformBlock.OutputCount != 0))
					{
						return;
					}

					ingestBlock.Complete();
				}
			}, new ExecutionDataflowBlockOptions {MaxDegreeOfParallelism = 1});

			// Wire the chain: ingest -> aggregation -> step 1 -> ... -> step N -> checker.
			// Every link is created with PropagateCompletion = true.
			ingestBlock.LinkTo(ingestBlockForAggregation, new DataflowLinkOptions {PropagateCompletion = true});
			TransformBlock<PropertyBag, PropertyBag> previous = ingestBlockForAggregation;
			foreach (TransformBlock<PropertyBag, PropertyBag> transformBlock in pipeline)
			{
				previous.LinkTo(transformBlock, new DataflowLinkOptions {PropagateCompletion = true});
				previous = transformBlock;
			}

			previous.LinkTo(terminationCheckerBlock, new DataflowLinkOptions {PropagateCompletion = true});
			// Seed the chain with the configured start URIs.
			foreach (Uri startUri in StartUris)
			{
				ingestBlock.Post(startUri);
			}

			return terminationCheckerBlock.Completion;
		}
Example #24
0
 /// <summary>
 /// Pipeline step that writes the URI of the current step to the current HTTP
 /// response as an HTML line.
 /// </summary>
 /// <param name="crawler">The crawler (not used by this step).</param>
 /// <param name="propertyBag">The property bag whose step URI is written.</param>
 public void Process(Crawler crawler, PropertyBag propertyBag)
 {
     // The URI comes from crawled content, so HTML-encode it before it is placed
     // in the response markup.
     // NOTE(review): HttpContext.Current is used as before; confirm this step only
     // runs where a current HTTP context exists.
     string encodedUrl = HttpContext.Current.Server.HtmlEncode(propertyBag.Step.Uri.ToString());
     HttpContext.Current.Response.Write("<br>FindUrl:" + encodedUrl);
 }