/// <summary>
/// Executes specific scraping logic for the passed scraper.
/// (Only role is message propagation!)
/// </summary>
/// <param name="browser">Headless browser instance used by the scraper.</param>
/// <param name="scraper">Site-specific scraper this pipeline instance runs.</param>
/// <param name="realTimePublisher">Real-time publisher the pipeline forwards messages to.</param>
/// <param name="dataConsumer">Consumer that receives messages at the end of the pipeline.</param>
public DataflowPipelineClass(ScrapingBrowser browser, ISiteSpecific scraper, IRealTimePublisher realTimePublisher, IDataConsumer dataConsumer)
{
    this._browser = browser;
    this._specificScraper = scraper;
    this._realTimeFeedPublisher = realTimePublisher;
    this._dataConsumer = dataConsumer;
}
/// <summary>
/// Builds a new TPL Dataflow pipeline for the given scraper and awaits it to completion.
/// </summary>
/// <param name="scraper">Site-specific scraper whose data feeds the pipeline.</param>
protected async Task InitSingleTDataflowPipeline(ISiteSpecific scraper)
{
    //TODO: await completion, then start next scraper (in future, with more threads, a few pipes could run in parallel as well)
    //TODO: move this class init and cts one layer out, into a "RunAll" method, once more scrapers are running
    //-->> later add a method that continuously scrapes on the same pipeline (i.e. don't init DataflowPipelineClass() here)

    // Dispose the CTS when the run is over so its internal timer/registrations are released.
    using var cts = new CancellationTokenSource();

    // init a new TPL pipeline for each new scraper, and all other required classes in the pipeline
    var pipeline = new DataflowPipelineClass(_browser, scraper, new RealTimePublisher(_hubConnection, _args), new DataConsumer());

    try
    {
        // No Task.Run wrapper needed: StartPipelineAsync is awaited directly, so its
        // exceptions surface here instead of inside a fire-and-forget lambda.
        await pipeline.StartPipelineAsync(cts.Token);

        // Only report completion when the pipeline actually finished without faulting.
        Console.WriteLine($"Pipe -->[{++PipeIndex}] done processing Messages!");
    }
    catch (AggregateException ae)
    {
        // NOTE: TPL Dataflow blocks wrap faults in AggregateException layers;
        // Flatten() pulls the nested exceptions out from under the aggregates.
        Console.WriteLine($"Pipeline {PipeIndex} terminated due to error {ae.Flatten()}");
    }
    catch (Exception ex)
    {
        // `await` rethrows the *inner* exception directly (not an AggregateException),
        // so the block above alone would miss most faults — catch them here as well.
        Console.WriteLine($"Pipeline {PipeIndex} terminated due to error {ex}");
    }

    #region TPL Channels option
    //2. or use TPLChannels instead ... (in my case 3rd option is better!)
    //var channel = new TPLChannelsClass();
    //channel.EnqueueAsync("llalall");

    //3. or directly output to signalR since it uses channels too
    // with RealTimePublisher -->PublishMessageToHub()
    //_realTimeFeedPublisher.PublishMessageToHub();
    #endregion TPL Channels option
}
/// <summary>
/// Producer side of the pipeline: scrapes data and posts it into the first block
/// one <see cref="Message"/> at a time, then completes the target so downstream
/// blocks can drain and finish.
/// </summary>
/// <param name="target">First block of the pipeline receiving the produced messages.</param>
/// <param name="token">Token that stops message production between posts.</param>
/// <param name="scraper">Site-specific scraper supplying the data.</param>
private async Task ConsumeWithDiscard(ITargetBlock<Message> target, CancellationToken token, ISiteSpecific scraper) //Maybe make this method async IAsyncEnumerable so msgs can be pushed as they arrive
{
    try
    {
        if (scraper.Url == "http://nabava.net")
        {
            //TODO: awaiting the full result here blocks streaming; replace with a separate method that only fetches markup
            var scrapedData = await scraper.ScrapeWebshops();

            //TODO: streaming currently pushes way too fast for reasonable data; maybe add a timer that sends batches every minute or so
            foreach (string item in scrapedData.Item1) // right now the same webshops are posted over and over to the pipeline
            {
                // Honor the cancellation token instead of ignoring it.
                if (token.IsCancellationRequested)
                {
                    break;
                }

                // map, then pass the msg to the pipeline
                var message = new Message();
                //message.SourceHtml = //scraped data
                message.Id = _counter;
                message.SiteUrl = item;
                message.Read = DateTime.Now; // NOTE(review): local time — consider DateTime.UtcNow if consumers compare timestamps
                _counter++;

                Console.WriteLine($"Read msg num[{_counter}] from [{message.SiteUrl}] @ [{message.Read}] on thread [{Thread.CurrentThread.ManagedThreadId}]"); // temp logging

                if (!target.Post(message))
                {
                    Console.WriteLine("Buffer full, Could not post!");
                }
            }
        }
    }
    finally
    {
        // Always complete the target: previously a non-matching URL (or an exception
        // during scraping) never completed the block, leaving downstream awaiters hung.
        target.Complete();
    }
}
/// <summary>
/// This is the entry point into the TPL Dataflow pipeline; data is then propagated
/// through the TPL blocks (the first block, a TransformBlock, in this case).
/// </summary>
/// <param name="target">First block of the pipeline that receives the produced messages.</param>
/// <param name="token">Token used to stop producing messages.</param>
/// <param name="scraper">Site-specific scraper that supplies the data.</param>
/// <returns>A task that completes only when the consumer has finished posting messages.</returns>
public Task StartConsuming(ITargetBlock<Message> target, CancellationToken token, ISiteSpecific scraper)
{
    // BUG FIX: StartNew with an async delegate returns Task<Task> — the outer task
    // completes as soon as the lambda hits its first await, NOT when consumption is
    // done, so callers awaiting the result returned too early. Unwrap() ties the
    // returned task to the inner one so it represents the whole consuming run.
    return Task.Factory.StartNew(
        () => ConsumeWithDiscard(target, token, scraper),
        token,
        TaskCreationOptions.LongRunning,
        TaskScheduler.Default).Unwrap();
}