/// <summary>
/// Records the outcome of a finished crawl: releases the proxy that was used
/// and stamps the keyword's last-crawl time in the repository.
/// </summary>
/// <param name="crawlDescription">Identifies the search engine, region, city, keyword and proxy that were crawled.</param>
/// <param name="crawlResult">Outcome of the crawl; its <c>CrawlResultID</c> is passed along when the proxy is unlocked.</param>
public async Task LogCrawl(CrawlDescription crawlDescription, CrawlResult crawlResult)
        {
            // mark proxy as used to unlock for later usage
            _proxyManager.UnLock(crawlDescription.SearchEngineID, crawlDescription.RegionID,
                                 crawlDescription.ProxyID, crawlResult.CrawlResultID);

            // update last crawl timestamp for this engine/region/city/keyword.
            // NOTE(review): Task.Run only offloads a synchronous repository call
            // to the thread pool so this method stays awaitable; a truly async
            // repository method would be preferable.
            // NOTE(review): DateTime.Now is local time — confirm whether the
            // store expects UTC (DateTime.UtcNow) before changing it.
            await Task.Run(() =>
            {
                _keywordScrapeDetailRepo.UpdateLastCrawl(
                    crawlDescription.SearchEngineID,
                    crawlDescription.RegionID,
                    crawlDescription.CityID,
                    crawlDescription.KeywordID,
                    DateTime.Now);
            });

            // TODO(zvp): Write to flat files
        }
// Example #2 (vote count: 0)
 /// <summary>
 /// Creates a scraper bound to the crawl it will execute.
 /// </summary>
 /// <param name="crawlDescription">The crawl to perform; must not be null.</param>
 /// <exception cref="ArgumentNullException">Thrown when <paramref name="crawlDescription"/> is null.</exception>
 public Scraper(CrawlDescription crawlDescription)
 {
     // Guard clause: fail at construction time rather than deep inside the
     // scrape loop.
     if (crawlDescription is null)
     {
         throw new ArgumentNullException(nameof(crawlDescription));
     }

     CrawlDescription = crawlDescription;
 }
// Example #3 (vote count: 0)
        /// <summary>
        /// Main scrape loop: waits until un-paused, dequeues the next crawl
        /// description, records the crawl, and repeats until cancellation is
        /// requested.
        /// NOTE(review): everything after the first <c>continue</c> is currently
        /// unreachable debug scaffolding (the real WebScraper.exe pipe
        /// hand-off); it is kept — cleaned up — for when the short-circuit is
        /// removed.
        /// </summary>
        public async Task Start()
        {
            int crawlCount = 0;

            // Logs one completed crawl and bumps the counter. Shared by the
            // short-circuit path and the (currently dead) pipe path below.
            async Task RecordCrawlAsync(CrawlDescription description, CrawlResult result)
            {
                await _crawlLogger.LogCrawl(description, result);

                crawlCount += 1;
                Log.Information("Crawled Keyword: {0}, Proxy: {1}, SearchString: {2}, Count: {3}",
                                description.Keyword, description.IP, description.SearchString, crawlCount);
            }

            while (!_cancellationToken.IsCancellationRequested)
            {
                // pause gate (lets the current scrape finish before pausing)
                await _manualResetEvent.WaitAsync();

                CrawlDescription crawlDescription = await _scraperQueue.Dequeue();

                CrawlResult crawlResult = null;

                // Short-circuit: record an unconditional success without running
                // the scraper process. TODO(review): remove the block below
                // (including the `continue`) to enable the pipe-based crawl.
                await RecordCrawlAsync(crawlDescription, new CrawlResult
                {
                    CrawlResultID = Shared.Enum.CrawlResultID.Success
                });

                continue;

                // ---- unreachable while the `continue` above remains ----

                // pause gate (lets the dequeue finish before pausing)
                await _manualResetEvent.WaitAsync();

                // `using` ensures the Process handle is released even if the
                // pipe exchange throws (the original only Close()d it and never
                // disposed it).
                using (Process pipeClient = new Process
                {
                    StartInfo = new ProcessStartInfo
                    {
                        FileName         = "WebScraper.exe", // TODO(zvp): Don't hardcode this name
                        CreateNoWindow   = false,
                        UseShellExecute  = false,
                        WindowStyle      = ProcessWindowStyle.Hidden,
                        WorkingDirectory = AppDomain.CurrentDomain.BaseDirectory
                    }
                })
                // Anonymous pipes for in/out inter-process communication.
                using (AnonymousPipeServerStream pipeServerWriter =
                           new AnonymousPipeServerStream(PipeDirection.Out,
                                                         HandleInheritability.Inheritable))
                using (AnonymousPipeServerStream pipeServerReader =
                           new AnonymousPipeServerStream(PipeDirection.In,
                                                         HandleInheritability.Inheritable))
                {
                    // Start the pipe client (WebScraper.exe), handing it both
                    // pipe handles on the command line.
                    pipeClient.StartInfo.Arguments =
                        $"{pipeServerWriter.GetClientHandleAsString()} {pipeServerReader.GetClientHandleAsString()}";
                    pipeClient.Start();

                    // Release our copies of the client-side handles so the pipes
                    // break cleanly when the child process exits.
                    pipeServerWriter.DisposeLocalCopyOfClientHandle();
                    pipeServerReader.DisposeLocalCopyOfClientHandle();

                    try
                    {
                        using (StreamWriter sw = new StreamWriter(pipeServerWriter))
                        {
                            // flush after every write
                            sw.AutoFlush = true;

                            // handshake before sending the payload
                            await sw.WriteLineAsync("SYNC");

                            pipeServerWriter.WaitForPipeDrain();

                            // send the crawl description as JSON
                            string serializedCrawlDescription = JsonConvert.SerializeObject(crawlDescription);
                            await sw.WriteLineAsync(serializedCrawlDescription);
                        }

                        using (StreamReader sr = new StreamReader(pipeServerReader))
                        {
                            string message;

                            // Skip chatter until the child echoes the SYNC
                            // handshake.
                            do
                            {
                                // TODO(zvp) : have to exit eventually.
                                message = await sr.ReadLineAsync();

                                Log.Debug("Pipe Received Message: {0}", message);
                            } while (message == null || !message.StartsWith("SYNC"));

                            message = await sr.ReadLineAsync();

                            crawlResult = JsonConvert.DeserializeObject<CrawlResult>(message);
                            Log.Debug("Pipe Received Crawl Result: {0}", message);
                        }
                    }
                    catch (Exception ex)
                    {
                        Log.Error("WebScraper Exception({0}): {1}", ex.GetType(), ex.Message);
                    }
                    finally
                    {
                        // wait for client to shutdown before disposal
                        pipeClient.WaitForExit();
                    }
                }

                // BUGFIX: log the result the scraper actually reported instead
                // of an unconditional Success; fall back to Success only when
                // the pipe exchange failed to produce one (crawlResult == null).
                await RecordCrawlAsync(crawlDescription, crawlResult ?? new CrawlResult
                {
                    CrawlResultID = Shared.Enum.CrawlResultID.Success
                });
            }
        }