/// <summary>
 /// Creates a pipeline bound to a single site-specific scraper.
 /// The constructor only stores the collaborators it is handed; it starts no work.
 /// </summary>
 /// <param name="browser">Headless browser instance kept for the pipeline.</param>
 /// <param name="scraper">Site-specific scraper this pipeline is created for.</param>
 /// <param name="realTimePublisher">Publisher stored as the pipeline's real-time feed publisher.</param>
 /// <param name="dataConsumer">Data consumer stored for the pipeline.</param>
 public DataflowPipelineClass(
     ScrapingBrowser browser,
     ISiteSpecific scraper,
     IRealTimePublisher realTimePublisher,
     IDataConsumer dataConsumer)
 {
     // Parameter names never collide with the underscore-prefixed fields,
     // so no "this." qualification is needed.
     _dataConsumer          = dataConsumer;
     _realTimeFeedPublisher = realTimePublisher;
     _specificScraper       = scraper;
     _browser               = browser;
 }
Beispiel #2
0
        /// <summary>
        /// Creates a fresh <see cref="DataflowPipelineClass"/> for <paramref name="scraper"/>, starts it and
        /// waits until it has finished. Failures are written to the console and are NOT rethrown.
        /// </summary>
        /// <param name="scraper">Site-specific scraper the new pipeline is built around.</param>
        /// <returns>A task that completes once the pipeline run has ended (successfully or not).</returns>
        protected async Task InitSingleTDataflowPipeline(ISiteSpecific scraper)
        {
            // TODO: await completion, then start the next scraper (with more threads a few pipelines could run in parallel).
            // TODO: move pipeline construction and the CancellationTokenSource one layer out (into a "RunAll" method)
            //       once more scrapers are running.
            // TODO: later add a mode that keeps scraping on the same pipeline instead of creating a new
            //       DataflowPipelineClass here on every call.

            // The CancellationTokenSource is IDisposable; the using block releases it on every exit path.
            using (var cts = new CancellationTokenSource())
            {
                // A new TPL pipeline (plus its publisher and consumer) is created for each scraper.
                var pipeline = new DataflowPipelineClass(_browser, scraper, new RealTimePublisher(_hubConnection, _args), new DataConsumer());

                try
                {
                    await Task.Run(async () =>
                    {
                        try
                        {
                            await pipeline.StartPipelineAsync(cts.Token);
                        }
                        catch (AggregateException ae)
                        {
                            // NOTE: TPL Dataflow wraps each propagated exception in its own AggregateException layer;
                            // ae.Flatten() can be used to pull the nested exceptions out.
                            Console.WriteLine($"Pipeline {PipeIndex} terminated due to error {ae}");
                        }

                        // Runs after both a successful run and a handled AggregateException.
                        Console.WriteLine($"Pipe -->[{++PipeIndex}] done processing Messages!");
                    });
                }
                catch (Exception ex)
                {
                    // Anything that is not an AggregateException ends up here and is only logged (best effort).
                    Console.WriteLine(ex.Message);
                }
            }

            #region TPL Channels option

            // Alternatives considered for delivering the output:
            // 2. use TPL Channels instead (a TPLChannelsClass with EnqueueAsync), or
            // 3. output directly to SignalR through RealTimePublisher.PublishMessageToHub(),
            //    since SignalR uses channels too (option 3 was judged the better fit).

            #endregion TPL Channels option
        }
Beispiel #3
0
        /// <summary>
        /// Scrapes the web shops of the supported site and posts one <see cref="Message"/> per scraped entry
        /// to <paramref name="target"/>. The target is always signalled when this method ends: completed on
        /// normal termination (including an unsupported scraper or a cancellation request) and faulted when
        /// an exception occurs.
        /// </summary>
        /// <param name="target">Block that receives the messages; messages it declines are discarded.</param>
        /// <param name="token">When cancellation is requested, posting stops and the target is completed.</param>
        /// <param name="scraper">Scraper whose <c>Url</c> selects the scraping logic and which supplies the data.</param>
        /// <returns>A task that completes when every message was offered to the target.</returns>
        private async Task ConsumeWithDiscard(ITargetBlock <Message> target, CancellationToken token, ISiteSpecific scraper)
        {
            // TODO: consider turning this into an IAsyncEnumerable producer so messages are pushed as they arrive.
            const string SupportedSiteUrl = "http://nabava.net";

            try
            {
                if (scraper.Url == SupportedSiteUrl)
                {
                    // TODO: the whole scrape result is awaited up front; replace with a separate method that only fetches markup.
                    var scrapedData = await scraper.ScrapeWebshops();

                    // TODO: streaming is currently far too fast for reasonable data; maybe send batches on a timer instead.
                    foreach (string item in scrapedData.Item1)
                    {
                        // Honour cancellation between messages instead of ignoring the token.
                        if (token.IsCancellationRequested)
                        {
                            break;
                        }

                        // Map the scraped entry to a pipeline message.
                        var message = new Message
                        {
                            Id      = _counter,
                            SiteUrl = item,
                            Read    = DateTime.Now
                        };

                        _counter++;
                        Console.WriteLine($"Read mdg num[{_counter}] from [{message.SiteUrl}] @ [{message.Read}] on thread [{Thread.CurrentThread.ManagedThreadId}]");// temp logging

                        // Post never waits: a declined message is dropped and only reported.
                        if (!target.Post(message))
                        {
                            Console.WriteLine("Buffer full, Could not post!");
                        }
                    }
                }

                // Signal "no more input" on every non-failing path, including an unsupported scraper.
                // NOTE(review): presumably the pipeline waits on this block's completion - confirm against StartPipelineAsync.
                target.Complete();
            }
            catch (Exception ex)
            {
                // Surface the failure through the block so a waiter is not left hanging, then rethrow.
                target.Fault(ex);
                throw;
            }
        }
Beispiel #4
0
 /// <summary>
 /// Entry point into the TPL Dataflow pipeline: starts feeding <paramref name="target"/> (the first block of
 /// the pipeline) with messages produced for <paramref name="scraper"/>; the data then propagates through
 /// the linked blocks.
 /// </summary>
 /// <param name="target">First block of the pipeline, which receives the produced messages.</param>
 /// <param name="token">Cancellation token handed on to the message producer.</param>
 /// <param name="scraper">Site-specific scraper that supplies the data.</param>
 /// <returns>
 /// A task that represents the whole producing operation: it completes when the producer has finished
 /// and carries any exception the producer threw.
 /// </returns>
 public Task StartConsuming(ITargetBlock <Message> target, CancellationToken token, ISiteSpecific scraper)
 {
     // StartNew with an async delegate yields Task<Task>. Without Unwrap the returned task would finish at
     // the producer's first await and its exceptions would go unobserved.
     // CancellationToken.None is deliberate: the producer must always run so it can signal the target.
     return Task.Factory
                .StartNew(() => ConsumeWithDiscard(target, token, scraper),
                          CancellationToken.None,
                          TaskCreationOptions.LongRunning,
                          TaskScheduler.Default)
                .Unwrap();
 }