public void Save(PropertyBag propertyBag, XmlDocument data)
{
    foreach (XmlElement e in data.DocumentElement.ChildNodes)
    {
        Save(propertyBag, e);
    }
}
/// <summary> /// </summary> /// <param name="crawler"> /// The crawler. /// </param> /// <param name="propertyBag"> /// The property bag. /// </param> public void Process(Crawler crawler, PropertyBag propertyBag) { CultureInfo contentCulture = (CultureInfo)propertyBag["LanguageCulture"].Value; string cultureDisplayValue = "N/A"; if (!contentCulture.IsNull()) { cultureDisplayValue = contentCulture.DisplayName; } TextExtraction t = new TextExtraction(); lock (this) { BIDVObject item = new BIDVObject(); //item.Id = Guid.NewGuid(); //item.Url = propertyBag.Step.Uri.ToString(); //if (item.Url.StartsWith("http://bidvportal.vn/eDocman")) //{ // item.Title = propertyBag.Title; // string strTarget = t.GetMinimumString(propertyBag.Text, "Chi tiết văn bản", "Nội dung văn bản"); // item.Text = strTarget; // string strNgayPhatHanh = t.GetMinimumString(strTarget, "Ngày phát hành", "Số đi"); // item.NgayPhatHanh = strNgayPhatHanh.Replace(' ','/'); // string strSubject = t.GetMinimumString(strTarget, "Trích yếu", "Độ khẩn"); // //item.Subject = strSubject; // //item.ContentEncoding = propertyBag.ContentEncoding; // //item.ContentType = propertyBag.ContentType; // //item.Length = propertyBag.Text.IsNull() ? 0 : propertyBag.Text.Length; // item.Depth = propertyBag.Step.Depth; // //item.CultureDisplayValue = cultureDisplayValue; // string[] strSplit = { "/" }; // int day = int.Parse(item.NgayPhatHanh.Split(strSplit, StringSplitOptions.None)[0]); // int month = int.Parse(item.NgayPhatHanh.Split(strSplit, StringSplitOptions.None)[1]); // int year = int.Parse(item.NgayPhatHanh.Split(strSplit, StringSplitOptions.None)[2]); // if ((DateTime.Now.Year == year) && (DateTime.Now.Month == month) && (DateTime.Now.Day == day)) // { // //db.AddToItems(item); // } //} } try { db.SaveChanges(); } catch (Exception ex) { Console.WriteLine("====================================================="); Console.WriteLine(ex.Message); } }
/// <summary> /// </summary> /// <param name="crawler"> /// The crawler. /// </param> /// <param name="propertyBag"> /// The property bag. /// </param> public void Process(Crawler crawler, PropertyBag propertyBag) { CultureInfo contentCulture = (CultureInfo)propertyBag["LanguageCulture"].Value; string cultureDisplayValue = "N/A"; if (!contentCulture.IsNull()) { cultureDisplayValue = contentCulture.DisplayName; } TextExtraction t = new TextExtraction(); lock (this) { BIDVObject item = new BIDVObject(); item.OriginalUrl = propertyBag.Step.Uri.ToString(); if (!IsDuplicate(item.OriginalUrl)) { item.Title = propertyBag.Title; item.StatusDescription = propertyBag.StatusDescription; item.ResponseUri = propertyBag.ResponseUri.ToString(); item.Text = propertyBag.Text; item.Depth = propertyBag.Step.Depth; item.LastModified = propertyBag.LastModified; item.OriginalReferrerUrl = propertyBag.OriginalReferrerUrl.ToString(); item.Server = propertyBag.Server; string description = t.GetBetween2Words("Chi tiết văn bản", "Xem toàn màn hình", item.Text.Replace("\r"," ").Replace("\n"," ")); item.Summary = t.RemoveWhiteSpace(description); string strNgayPhatHanh = t.GetBetween2Words("Ngày phát hành", "Số đi", item.Summary); strNgayPhatHanh = strNgayPhatHanh.Replace(' ', '/').Remove(0, ("Ngày phát hành").Length); string[] strSplit = { "/" }; int day = int.Parse(strNgayPhatHanh.Split(strSplit, StringSplitOptions.None)[1]); int month = int.Parse(strNgayPhatHanh.Split(strSplit, StringSplitOptions.None)[2]); int year = int.Parse(strNgayPhatHanh.Split(strSplit, StringSplitOptions.None)[3]); //Clean the text field is null item.Text = null; item.IsToEmail = false; db.AddToBIDVObjects(item); item.ContentEncoding = propertyBag.ContentEncoding; item.ContentType = propertyBag.ContentType; //item.Length = propertyBag.Text.IsNull() ? 0 : propertyBag.Text.Length; //item.CultureDisplayValue = cultureDisplayValue; } } try { db.SaveChanges(); } catch (Exception ex) { throw new Exception(ex.Message); } }
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    ITemplate template = SelectTemplate(propertyBag);
    if (template != null)
    {
        XmlDocument result = template.Parse(propertyBag["HtmlDoc"].Value as HtmlAgilityPack.HtmlDocument);
        MongoDBSaver saver = new MongoDBSaver();
        saver.Save(propertyBag, result);
    }
}
/// <summary>
/// Returns true to continue the crawl of this url, else false.
/// </summary>
/// <returns>True if the crawl of this step should continue, else false</returns>
private bool OnAfterDownload(CrawlStep crawlStep, PropertyBag response)
{
    EventHandler<AfterDownloadEventArgs> afterDownloadTmp = AfterDownload;
    if (afterDownloadTmp.IsNull())
    {
        return crawlStep.IsAllowed;
    }

    AfterDownloadEventArgs e = new AfterDownloadEventArgs(!crawlStep.IsAllowed, response);
    afterDownloadTmp(this, e);
    return !e.Cancel;
}
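// A hedged usage sketch for the AfterDownload event raised by OnAfterDownload above.
// Only the Cancel flag and the PropertyBag passed to the AfterDownloadEventArgs
// constructor are confirmed by that code; the property name "Response" used here to
// reach that PropertyBag is an assumption and may differ in the real event args.
private void HookAfterDownload(Crawler crawler)
{
    crawler.AfterDownload += (sender, e) =>
    {
        // Setting Cancel makes OnAfterDownload return false, which skips the
        // pipeline for this response.
        if (e.Response != null
            && e.Response.ContentType != null
            && !e.Response.ContentType.StartsWith("text/html"))
        {
            e.Cancel = true;
        }
    };
}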
/// <summary> /// </summary> /// <param name="crawler"> /// The crawler. /// </param> /// <param name="propertyBag"> /// The property bag. /// </param> public void Process(Crawler crawler, PropertyBag propertyBag) { CultureInfo contentCulture = (CultureInfo)propertyBag["LanguageCulture"].Value; string cultureDisplayValue = "N/A"; if (!contentCulture.IsNull()) { cultureDisplayValue = contentCulture.DisplayName; } TextExtraction t = new TextExtraction(); lock (this) { ASPNETObject item = new ASPNETObject(); item.OriginalUrl = propertyBag.Step.Uri.ToString(); if (!IsDuplicate(item.OriginalUrl)) { item.Title = propertyBag.Title; item.StatusDescription = propertyBag.StatusDescription; item.ResponseUri = propertyBag.ResponseUri.ToString(); item.Text = null; item.Depth = propertyBag.Step.Depth; item.LastModified = propertyBag.LastModified; item.OriginalReferrerUrl = propertyBag.OriginalReferrerUrl.ToString(); item.Server = propertyBag.Server; //Clean the text field is null db.AddToASPNETObjects(item); item.ContentEncoding = propertyBag.ContentEncoding; item.ContentType = propertyBag.ContentType; item.IsToEmail = false; item.Summary = propertyBag.Title; //item.Length = propertyBag.Text.IsNull() ? 0 : propertyBag.Text.Length; //item.CultureDisplayValue = cultureDisplayValue; } } try { db.SaveChanges(); } catch (Exception ex) { throw new Exception(ex.Message); } }
protected virtual Template.ITemplate SelectTemplate(PropertyBag propertyBag)
{
    if (Regex.IsMatch(propertyBag.ResponseUri.AbsoluteUri, "http://archive.cnblogs.com/a/.*"))
    {
        if (mytemplate == null)
        {
            XmlDocument doc = new XmlDocument();
            doc.Load("cnblogs-post-lightweight.xslt");
            mytemplate = new XSLTTemplate(doc);
        }

        return mytemplate;
    }
    else
    {
        return null;
    }
}
private void Save(PropertyBag propertyBag, XmlElement data)
{
    MongoCollection<BsonDocument> collection = this.db.GetCollection(data.Name);
    BsonDocument doc = new BsonDocument();
    foreach (XmlElement e in data.ChildNodes)
    {
        if (!string.IsNullOrEmpty(e.InnerXml))
        {
            doc.Add(e.Name, new BsonString(e.InnerXml));
        }
    }

    doc.Add("url", new BsonString(propertyBag.ResponseUri.AbsoluteUri));
    if (propertyBag.Referrer.Uri != null)
    {
        doc.Add("referrer", new BsonString(propertyBag.Referrer.Uri.AbsoluteUri));
    }

    Console.WriteLine(doc[0].AsString);
    collection.Insert(doc);
}
/// <summary> /// </summary> /// <param name = "crawler"> /// The crawler. /// </param> /// <param name = "propertyBag"> /// The property bag. /// </param> public void Process(Crawler crawler, PropertyBag propertyBag) { CultureInfo contentCulture = (CultureInfo)propertyBag["LanguageCulture"].Value; string cultureDisplayValue = "N/A"; if (!contentCulture.IsNull()) { cultureDisplayValue = contentCulture.DisplayName; } lock (this) { Console.Out.WriteLine(ConsoleColor.Gray, "Url: {0}", propertyBag.Step.Uri); Console.Out.WriteLine(ConsoleColor.Blue, "stuff -> " + propertyBag.Text); Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tContent type: {0}", propertyBag.ContentType); Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tContent length: {0}", propertyBag.Text.IsNull() ? 0 : propertyBag.Text.Length); Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tDepth: {0}", propertyBag.Step.Depth); Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tCulture: {0}", cultureDisplayValue); Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tThreadId: {0}", Thread.CurrentThread.ManagedThreadId); Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tThread Count: {0}", crawler.ThreadsInUse); Console.Out.WriteLine(); } }
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    if (!bindevents)
    {
        crawler.CrawlFinished += new EventHandler<CrawlFinishedEventArgs>(crawler_CrawlFinished);
        bindevents = true;
    }

    string id = config.GetDocumentPath(propertyBag.Step.Uri);
    if (propertyBag.StatusCode == System.Net.HttpStatusCode.OK)
    {
        repository.AddUpdate(id, propertyBag.Title, propertyBag.Text, propertyBag.LastModified);
        log.Info("Add/Update [" + id + "]");
    }
    else if (propertyBag.StatusCode == System.Net.HttpStatusCode.NotFound)
    {
        log.Warning("Crawler encountered 404 for [" + id + "]");
        repository.Delete(id);
    }
    else
    {
        log.Warning(string.Format(
            "Crawler encountered status {0} - {4} ({1}) for document {2} - {3}",
            propertyBag.StatusCode.ToString(),
            propertyBag.StatusDescription,
            id,
            propertyBag.Step.Uri,
            ((int)propertyBag.StatusCode).ToString()));
    }
}
/// <summary>
/// Executes the OnProcessorException event
/// </summary>
private void OnProcessorException(PropertyBag propertyBag, Exception exception)
{
    m_Logger.Error("Exception while processing pipeline for {0}, error was {1}", propertyBag.OriginalUrl, exception);
    PipelineException.ExecuteEvent(this, () => new PipelineExceptionEventArgs(propertyBag, exception));
}
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    if (propertyBag.Step.Uri.PathAndQuery.ToLower().Contains("project"))
    {
        // Leftover debugging hook: a convenient place to set a breakpoint for "project" pages.
        string a = "";
    }

    if (!string.IsNullOrEmpty(propertyBag.Text) && !PreviouslyIndexed(propertyBag.Step.Uri.ToString()))
    {
        Lucene.Net.Documents.Document doc = new Lucene.Net.Documents.Document();

        // add string properties
        Lucene.Net.Documents.Field fldURL = new Lucene.Net.Documents.Field("url", propertyBag.Step.Uri.ToString(), Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.YES);
        doc.Add(fldURL);
        Lucene.Net.Documents.Field fldContent = new Lucene.Net.Documents.Field("content", propertyBag.Text, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.YES);
        doc.Add(fldContent);
        Lucene.Net.Documents.Field fldTitle = new Lucene.Net.Documents.Field("title", propertyBag.Title, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.YES);
        doc.Add(fldTitle);

        // write the document to the index
        indexWriter.AddDocument(doc);
    }
}
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    AspectF.Define.
        NotNull(crawler, "crawler").
        NotNull(propertyBag, "propertyBag");

    string stepUri = Uri.UnescapeDataString(propertyBag.Step.Uri.AbsoluteUri);
    if (stepUri.Length > 396)
    {
        stepUri = stepUri.Substring(0, 396);
    }

    var crawlHistory = AspectF.Define.
        Return<CrawlHistory, NCrawlerEntitiesDbServices>(
            e => e.CrawlHistory.Where(m => m.Key == stepUri).FirstOrDefault());

    if (crawlHistory == null)
    {
        AspectF.Define.Do<NCrawlerEntitiesDbServices>(e =>
        {
            e.ExecuteStoreCommand("delete Crawlqueue where [key] ={0}", stepUri);
        });
        return;
    }

    try
    {
        if (propertyBag.StatusCode != HttpStatusCode.OK)
        {
            AspectF.Define.Do<NCrawlerEntitiesDbServices>(e =>
            {
                e.ExecuteStoreCommand("delete Crawlqueue where [key] ={0}", crawlHistory.Key);
                //CrawlQueue result = e.CrawlQueue.FirstOrDefault(q => q.Key == crawlHistory.Key);
                //if (!result.IsNull())
                //{
                //    e.DeleteObject(result);
                //    e.SaveChanges();
                //}
            });
            return;
        }

        if (!IsHtmlContent(propertyBag.ContentType))
        {
            AspectF.Define.Do<NCrawlerEntitiesDbServices>(e =>
            {
                e.ExecuteStoreCommand("delete Crawlqueue where [key] ={0}", crawlHistory.Key);
                //CrawlQueue result = e.CrawlQueue.FirstOrDefault(q => q.Key == crawlHistory.Key);
                //if (!result.IsNull())
                //{
                //    e.DeleteObject(result);
                //    e.SaveChanges();
                //}
            });
            return;
        }

        HtmlDocument htmlDoc = new HtmlDocument
        {
            OptionAddDebuggingAttributes = false,
            OptionAutoCloseOnEnd = true,
            OptionFixNestedTags = true,
            OptionReadEncoding = true
        };

        using (Stream reader = propertyBag.GetResponse())
        {
            Encoding documentEncoding = htmlDoc.DetectEncoding(reader);
            reader.Seek(0, SeekOrigin.Begin);
            if (!documentEncoding.IsNull())
            {
                htmlDoc.Load(reader, documentEncoding, true);
            }
            else
            {
                htmlDoc.Load(reader, true);
            }

            //string content = reader.ReadToEnd();
            //resultHtmlContent = content;
        }

        //string steplUri = propertyBag.ResponseUri.OriginalString;
        string orginalHtmlContent = htmlDoc.DocumentNode.OuterHtml;
        string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
        DocumentWithLinks links = htmlDoc.GetLinks();

        //string urlRegex = @"^http://www.bbc.co.uk/food/recipes/[^#/]+$";
        List<string> recipeRegex = null;
        var jsonStr = cache.Get(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite") as string;
        if (jsonStr == null)
        {
            using (var stream = new StreamReader(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite.txt", Encoding.UTF8))
            {
                jsonStr = stream.ReadToEnd();
                var policy = new CacheItemPolicy();
                policy.Priority = CacheItemPriority.NotRemovable;
                policy.AbsoluteExpiration = DateTimeOffset.Now.AddDays(1);
                cache.Set(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite", jsonStr, policy);
                Console.WriteLine("cache --" + AppDomain.CurrentDomain.BaseDirectory + " :" + cache.Get(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite"));
            }
        }

        var json = JsonConvert.DeserializeObject<OriginalWebSiteTxt>(jsonStr);
        if (json.RecipeRegex != null && json.RecipeRegex.Count > 0)
        {
            recipeRegex = json.RecipeRegex;
        }

        bool needToStore = false;
        if (recipeRegex != null)
        {
            foreach (var regex in recipeRegex)
            {
                if (Regex.IsMatch(propertyBag.Step.Uri.AbsoluteUri, regex, RegexOptions.IgnoreCase))
                {
                    needToStore = true;
                    break;
                }
            }
        }
        else
        {
            needToStore = true;
        }

        if (needToStore)
        {
            //string folderPath = "D:/CrawlerManager/CrawlerData";
            //string instanceFolderPath = folderPath + "/" + crawlHistory.GroupId;
            //string path = folderPath + "/" + crawlHistory.GroupId + "/" + string.Format("{0}.txt", crawlHistory.Id);
            //if (!Directory.Exists(folderPath))
            //{
            //    Directory.CreateDirectory(folderPath);
            //}
            //if (!Directory.Exists(instanceFolderPath))
            //{
            //    Directory.CreateDirectory(instanceFolderPath);
            //}
            //if (!File.Exists(path))
            //{
            //    try
            //    {
            //        using (StreamWriter sw = File.CreateText(path))
            //        {
            //            sw.WriteLine(orginalHtmlContent);
            //        }
            //    }
            //    catch (Exception ex)
            //    {
            //        log4net.Config.XmlConfigurator.Configure();
            //        log4net.ILog log = log4net.LogManager.GetLogger("logger-name");
            //        log.Error(ex);
            //    }
            //}

            var folderHelper = new FolderHelper();
            var path = folderHelper.GetFolderPathToStore(crawlHistory.GroupId) + "/" + string.Format("{0}.txt", crawlHistory.Id);
            Console.Write(path);
            if (!File.Exists(path))
            {
                try
                {
                    using (StreamWriter sw = File.CreateText(path))
                    {
                        sw.WriteLine(orginalHtmlContent);
                    }
                }
                catch (Exception ex)
                {
                    log4net.Config.XmlConfigurator.Configure();
                    log4net.ILog log = log4net.LogManager.GetLogger("logger-name");
                    log.Error(ex);
                }
            }
        }

        AspectF.Define.Do<NCrawlerEntitiesDbServices>(e =>
        {
            e.ExecuteStoreCommand("delete Crawlqueue where [key] ={0}", crawlHistory.Key);
        });

        foreach (string link in links.Links.Union(links.References))
        {
            if (link.IsNullOrEmpty() || link.Length > 396)
            {
                continue;
            }

            string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(link);
            string normalizedLink = "";
            try
            {
                normalizedLink = NormalizeLink(baseUrl, decodedLink);
            }
            catch (Exception)
            {
                continue;
            }

            if (normalizedLink.IsNullOrEmpty())
            {
                continue;
            }

            if (link.Contains("page="))
            {
                // Leftover debugging hook for paginated links.
                var a = 1;
            }

            crawler.AddStep(new Uri(normalizedLink), propertyBag.Step.Depth + 1,
                propertyBag.Step, new Dictionary<string, object>
                {
                    { Resources.PropertyBagKeyOriginalUrl, link },
                    { Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri }
                });
        }
    }
    catch (Exception ex)
    {
        AspectF.Define.Do<NCrawlerEntitiesDbServices>(e =>
        {
            e.ExecuteStoreCommand("delete Crawlqueue where [key] ={0}", crawlHistory.Key);
        });
        log4net.Config.XmlConfigurator.Configure();
        log4net.ILog log = log4net.LogManager.GetLogger("logger-name");
        log.Error(ex);
    }
}
private void ExecutePipeLineStep(IPipelineStep pipelineStep, PropertyBag propertyBag)
{
    try
    {
        if (pipelineStep is IPipelineStepWithTimeout)
        {
            IPipelineStepWithTimeout stepWithTimeout = (IPipelineStepWithTimeout)pipelineStep;
            m_Logger.Debug("Running pipeline step {0} with timeout {1}",
                pipelineStep.GetType().Name, stepWithTimeout.ProcessorTimeout);
            m_TaskRunner.RunSync(() => pipelineStep.Process(this, propertyBag), stepWithTimeout.ProcessorTimeout);
        }
        else
        {
            m_Logger.Debug("Running pipeline step {0}", pipelineStep.GetType().Name);
            pipelineStep.Process(this, propertyBag);
        }
    }
    catch (Exception ex)
    {
        OnProcessorException(propertyBag, ex);
    }
}
/// <summary>
/// Executes all the pipeline steps sequentially for each piece of downloaded content
/// in the crawl process. Used to extract data from the content, such as which
/// URLs to follow, email addresses, and so on.
/// </summary>
/// <param name="propertyBag">Downloaded content</param>
private void ExecutePipeLine(PropertyBag propertyBag)
{
    Pipeline.ForEach(pipelineStep => ExecutePipeLineStep(pipelineStep, propertyBag));
}
private void ExecutePipeLineStep(IPipelineStep pipelineStep, PropertyBag propertyBag)
{
    try
    {
        Stopwatch sw = Stopwatch.StartNew();
        m_Logger.Debug("Executing pipeline step {0}", pipelineStep.GetType().Name);

        if (pipelineStep is IPipelineStepWithTimeout)
        {
            IPipelineStepWithTimeout stepWithTimeout = (IPipelineStepWithTimeout)pipelineStep;
            m_Logger.Debug("Running pipeline step {0} with timeout {1}",
                pipelineStep.GetType().Name, stepWithTimeout.ProcessorTimeout);
            m_TaskRunner.RunSync(cancelArgs =>
                {
                    if (!cancelArgs.Cancel)
                    {
                        pipelineStep.Process(this, propertyBag);
                    }
                }, stepWithTimeout.ProcessorTimeout);
        }
        else
        {
            pipelineStep.Process(this, propertyBag);
        }

        m_Logger.Debug("Executed pipeline step {0} in {1}", pipelineStep.GetType().Name, sw.Elapsed);
    }
    catch (Exception ex)
    {
        OnProcessorException(propertyBag, ex);
    }
}
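// A minimal sketch of a pipeline step that opts into the timeout branch above, assuming
// IPipelineStepWithTimeout only adds a ProcessorTimeout property (typed here as TimeSpan)
// on top of IPipelineStep, which is what the cast and the stepWithTimeout.ProcessorTimeout
// usage imply. The class name and timeout value are illustrative.
public class SlowLookupStep : IPipelineStepWithTimeout
{
    public TimeSpan ProcessorTimeout
    {
        get { return TimeSpan.FromSeconds(30); }
    }

    public void Process(Crawler crawler, PropertyBag propertyBag)
    {
        // Long-running work (external lookups, heavy parsing) would go here; the task
        // runner above stops waiting on it once ProcessorTimeout has elapsed.
    }
}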
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    string text = propertyBag.Text;
    if (text.IsNullOrEmpty())
    {
        return;
    }

    if (text.IndexOf(ContentToBeFound) != -1)
    {
        // Guard against pages shorter than 100 characters; string.Remove(100) throws otherwise.
        string excerpt = text.Length > 100 ? text.Remove(100) : text;
        paginas.Add(new Link(propertyBag.Step.Uri.ToString(), propertyBag.Step.Uri.ToString(), excerpt));
    }
}
private void EndDownload(CrawlStep crawlStep, PropertyBag propertyBag, Exception exception,
    ThreadSafeCounter.ThreadSafeCounterCookie counterCookie)
{
    using (counterCookie)
    {
        if (exception != null)
        {
            OnDownloadException(exception, crawlStep);
        }
        else if (!propertyBag.IsNull())
        {
            propertyBag.Referrer = crawlStep;

            // Assign initial properties to the property bag
            if (!counterCookie.CrawlerQueueEntry.Properties.IsNull())
            {
                counterCookie.CrawlerQueueEntry.Properties.
                    ForEach(key => propertyBag[key.Key].Value = key.Value);
            }

            if (OnAfterDownload(crawlStep, propertyBag))
            {
                // Executes all the pipeline steps sequentially for each piece of downloaded
                // content in the crawl process. Used to extract data from the content,
                // such as which URLs to follow, email addresses, and so on.
                Pipeline.ForEach(pipelineStep => ExecutePipeLineStep(pipelineStep, propertyBag));
            }
        }
    }

    ProcessQueue();
}
public void Crawl(Uri uri, PropertyBag referer)
{
    int depth = referer?.Step?.Depth + 1 ?? 0;
    _transformBlock.Post(new PropertyBag
    {
        Step = new CrawlStep(uri, depth),
        Referrer = referer?.Referrer,
        UserAgent = _userAgent
    });
}
/// <summary> /// </summary> /// <param name="crawler"> /// The crawler. /// </param> /// <param name="propertyBag"> /// The property bag. /// </param> public void Process(Crawler crawler, PropertyBag propertyBag) { //CultureInfo contentCulture = (CultureInfo)propertyBag["LanguageCulture"].Value; //string cultureDisplayValue = "N/A"; //if (!contentCulture.IsNull()) //{ // cultureDisplayValue = contentCulture.DisplayName; //} lock (this) { //EchoControl.Invoke(new ShowTitleDelegate(ShowTitle), propertyBag.Title); InvokeOneWorkFinished(propertyBag.Title); //Console.Out.WriteLine(ConsoleColor.Gray, "Url: {0}", propertyBag.Step.Uri); //Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tContent type: {0}", propertyBag.ContentType); //Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tContent length: {0}", propertyBag.Text.IsNull() ? 0 : propertyBag.Text.Length); //Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tDepth: {0}", propertyBag.Step.Depth); //Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tCulture: {0}", cultureDisplayValue); //Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tThreadId: {0}", System.Threading.Thread.CurrentThread.ManagedThreadId); //Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tThread Count: {0}", crawler.ThreadsInUse); //Console.Out.WriteLine(); } }
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    Console.Out.WriteLine(propertyBag.Step.Uri);
}
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    foreach (Uri uri in seeds)
    {
        crawler.AddStep(uri, 2);
    }
}
public Task RunAsync()
{
    TransformBlock<Uri, PropertyBag> ingestBlock = new TransformBlock<Uri, PropertyBag>(input =>
        {
            PropertyBag result = new PropertyBag
            {
                OriginalUrl = input.ToString(),
                UserAgent = _userAgent,
                Step = new CrawlStep(input, 0)
            };
            return result;
        },
        new ExecutionDataflowBlockOptions { MaxDegreeOfParallelism = MaxDegreeOfParallelism });

    TransformBlock<PropertyBag, PropertyBag> ingestBlockForAggregation =
        new TransformBlock<PropertyBag, PropertyBag>(input => input,
            new ExecutionDataflowBlockOptions { MaxDegreeOfParallelism = MaxDegreeOfParallelism });

    CrawlIngestionHelper crawlIngestionHelper = new CrawlIngestionHelper(ingestBlockForAggregation, _userAgent);
    TransformBlock<PropertyBag, PropertyBag>[] pipeline = Pipeline
        .Select(pipelineStep =>
        {
            return new TransformBlock<PropertyBag, PropertyBag>(async propertyBag =>
                {
                    if (propertyBag.StopPipelining)
                    {
                        return propertyBag;
                    }

                    try
                    {
                        propertyBag.StopPipelining = !await pipelineStep.Process(crawlIngestionHelper, propertyBag);
                    }
                    catch (Exception exception)
                    {
                        propertyBag.Exceptions.Add(exception);
                    }

                    return propertyBag;
                },
                new ExecutionDataflowBlockOptions { MaxDegreeOfParallelism = pipelineStep.MaxDegreeOfParallelism });
        })
        .ToArray();

    ActionBlock<PropertyBag> terminationCheckerBlock = new ActionBlock<PropertyBag>(propertyBag =>
        {
            if (ingestBlock.InputCount == 0
                && ingestBlock.OutputCount == 0
                && !ingestBlock.Completion.IsCompleted
                && !ingestBlock.Completion.IsCanceled
                && !ingestBlock.Completion.IsFaulted
                && ingestBlockForAggregation.InputCount == 0
                && ingestBlockForAggregation.OutputCount == 0)
            {
                if (pipeline.Any(transformBlock => transformBlock.InputCount != 0 || transformBlock.OutputCount != 0))
                {
                    return;
                }

                ingestBlock.Complete();
            }
        }, new ExecutionDataflowBlockOptions { MaxDegreeOfParallelism = 1 });

    ingestBlock.LinkTo(ingestBlockForAggregation, new DataflowLinkOptions { PropagateCompletion = true });
    TransformBlock<PropertyBag, PropertyBag> previous = ingestBlockForAggregation;
    foreach (TransformBlock<PropertyBag, PropertyBag> transformBlock in pipeline)
    {
        previous.LinkTo(transformBlock, new DataflowLinkOptions { PropagateCompletion = true });
        previous = transformBlock;
    }

    previous.LinkTo(terminationCheckerBlock, new DataflowLinkOptions { PropagateCompletion = true });
    foreach (Uri startUri in StartUris)
    {
        ingestBlock.Post(startUri);
    }

    return terminationCheckerBlock.Completion;
}
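// A hedged sketch of driving the dataflow pipeline above from a host. "DataflowCrawler"
// is a hypothetical stand-in for whatever class actually exposes StartUris, Pipeline and
// RunAsync(); only those members are taken from the method itself.
public static async Task RunCrawlAsync()
{
    var crawler = new DataflowCrawler();                    // hypothetical type name
    crawler.StartUris.Add(new Uri("http://example.com/"));  // StartUris is read by RunAsync()
    await crawler.RunAsync();                               // completes once the termination
                                                            // checker sees every block drained
}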
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    HttpContext.Current.Response.Write("<br>FindUrl:" + propertyBag.Step.Uri);
}