Example No. 1
        /// <summary>
        /// Constructor
        /// </summary>
        /// <param name="directory">Directory to save fetched web files</param>
        public SimpleFetcher(CrawlerConfig config)
        {
            mLogger = new RuntimeLogger(Path.Combine(config.LogDirectory, "Crawler.Log"));
            if (Directory.Exists(config.FetchDirectory))
            {
                Directory.Delete(config.FetchDirectory, true);
            }
            Directory.CreateDirectory(config.FetchDirectory);

            mConfig = config;

            var handler = new HttpClientHandler
            {
                UseProxy = false,
            };

            mClientWithoutProxy = new HttpClient(handler);
            mClientWithProxy    = new HttpClient();

            mClientWithoutProxy.DefaultRequestHeaders.Connection.Add("keep-alive");
            mClientWithoutProxy.DefaultRequestHeaders.UserAgent.ParseAdd(UserAgent);
            mClientWithoutProxy.DefaultRequestHeaders.Accept.ParseAdd("*/*");
            mClientWithoutProxy.DefaultRequestHeaders.AcceptLanguage.ParseAdd("en,zh-CN;q=0.9,zh;q=0.8,zh-TW;q=0.7,de;q=0.6,ru;q=0.5");
            mClientWithoutProxy.Timeout = new TimeSpan(0, 0, 8);

            mClientWithProxy.DefaultRequestHeaders.Connection.Add("keep-alive");
            mClientWithProxy.DefaultRequestHeaders.UserAgent.ParseAdd(UserAgent);
            mClientWithProxy.DefaultRequestHeaders.Accept.ParseAdd("*/*");
            mClientWithProxy.DefaultRequestHeaders.AcceptLanguage.ParseAdd("en,zh-CN;q=0.9,zh;q=0.8,zh-TW;q=0.7,de;q=0.6,ru;q=0.5");
            mClientWithProxy.Timeout = new TimeSpan(0, 0, 30);
        }
Example No. 2
 /// <summary>
 /// Constructor
 /// </summary>
 /// <param name="config">Crawler config</param>
 public SimpleUrlFrontier(CrawlerConfig config)
 {
     mLogger = new RuntimeLogger(Path.Combine(config.LogDirectory, "Crawler.log"));
     mConfig = config;
     config.UrlFrontierItemStore.Reload();
     config.UrlFrontierItemStore.Init(config.InitUrls);
 }
Example No. 3
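 /// <summary>
 /// Constructor
 /// </summary>
 /// <param name="root">Root URI of the crawl</param>
 /// <param name="parent">Parent URI of the content being parsed</param>
 /// <param name="content">Sitemap XML content to parse for links</param>
 /// <param name="config">Crawler config</param>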
 public SitemapLinkParser(Uri root, Uri parent, string content, CrawlerConfig config)
 {
     _root    = root;
     _parent  = parent;
     _content = content;
     _config  = config;
 }
Example No. 4
 /// <summary>
 /// Creates a new YouTube configuration instance.
 /// </summary>
 public YtConfig(CrawlerConfig config)
 {
     // Create the YouTube settings.
     this.settings = new YouTubeSettings(config.YouTubeV2ApiKey);
     // Create the YouTube categories.
     this.categories = new YouTubeCategories(config.YouTubeCategoriesFileName);
 }
Example No. 5
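        /// <summary>
        /// Validates the Douban group id and city name by fetching a first page of topics,
        /// then stores a new Douban crawler configuration.
        /// </summary>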
        public void AddDoubanConfig(string groupId, string cityName)
        {
            if (string.IsNullOrEmpty(groupId) || string.IsNullOrEmpty(cityName))
            {
                throw new Exception("请输入豆瓣小组Group和城市名称。");
            }
            var topics = DoubanService.GetHouseData(groupId, cityName, 1);

            if (topics == null)
            {
                throw new Exception("保存失败!请检查豆瓣小组ID(如:XMhouse)/城市名称(如:厦门)是否正确...");
            }
            var cityInfo = $"{{ 'groupid':'{groupId}','cityname':'{cityName}','pagecount':5}}";
            // TODO: skip the insert when a Douban config for this group/city already exists.
            var config = new CrawlerConfig()
            {
                ConfigurationKey   = 0,
                ConfigurationValue = cityInfo,
                ConfigurationName  = ConstConfigName.Douban,
                DataCreateTime     = DateTime.Now,
                IsEnabled          = true,
            };

            _dapper.Insert(config);
            return;
        }
Example No. 6
        public void TheRootAddressShouldBeCrawled()
        {
            var config = new CrawlerConfig
            {
                RootAddress  = new Uri("http://localhost:51746/"),
                Listener     = this,
                MaxDepth     = 1,
                CrawlerFlags = CrawlerFlags.IncludeLinks | CrawlerFlags.IncludeFailureCheck
            };

            Crawler.Crawl(config);
        }
Example No. 7
        private HashSet <string> GetArticleUrls(CrawlerConfig config)
        {
            // use a set so that links are not duplicated
            HashSet <string> setLink = new HashSet <string>();
            // load the web page and parse its HTML into a document

            HtmlDocument document = null;
            var          url      = config.Route + config.Path;

            try
            {
                document = _htmlWeb.Load(url);
            }
            catch (Exception err)
            {
                Console.WriteLine("LOAD HTML DOC FAILED: " + err.Message);
                return(setLink);
            }
            // select every anchor tag matched by the link selector
            var aItems       = document.DocumentNode.QuerySelectorAll(config.LinkSelector).ToList();
            var ExistedLinks = from article in _db.Articles select article.Link;

            // extract each article url and add it to the set
            foreach (var item in aItems)
            {
                var hrefValue = item.Attributes["href"].Value;
                var link      = "";
                // build an absolute url: prefix the route when the href is relative
                if (!hrefValue.Contains(config.Route))
                {
                    link = config.Route + hrefValue;
                }
                else
                {
                    link = hrefValue;
                }
                if (!ValidateHelper.IsUrlValid(link))
                {
                    Console.WriteLine($"[Not valid]: {link}");
                    continue;
                }
                // skip links that already exist in the database
                if (ExistedLinks.Contains(link))
                {
                    Console.WriteLine($"[Existed]: {link}");
                    continue;
                }
                Console.WriteLine($"[Get success]: {link}");
                setLink.Add(link);
            }

            return(setLink);
        }
Example No. 8
        public void TheCrawlShouldRecordAnError()
        {
            var config = new CrawlerConfig
            {
                RootAddress  = new Uri("http://PageNotFound/"),
                Listener     = this,
                MaxDepth     = 2,
                CrawlerFlags = CrawlerFlags.IncludeLinks
            };

            Crawler.Crawl(config);
        }
Example No. 9
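        /// <summary>
        /// Serializes the crawler config to JSON, inserts a new Crawl row, and returns its id.
        /// </summary>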
        public long NewCrawl(string baseUrl, CrawlerConfig config)
        {
            var jsonConfig = JsonSerializer.Serialize(config, new JsonSerializerOptions {
                WriteIndented = true, PropertyNamingPolicy = JsonNamingPolicy.CamelCase
            });
            var c = new Crawl {
                BaseUrl = baseUrl, Configuration = jsonConfig
            };

            _db.Insert(c);
            return(c.Id);
        }
Example No. 10
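        /// <summary>
        /// Packs the url and the config id into a small JSON payload.
        /// </summary>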
        private string PackConfigToJson(CrawlerConfig config, string url)
        {
            var packed = new
            {
                Link     = url,
                ConfigId = config.Id
            };
            var jsonPacked = JsonConvert.SerializeObject(packed);

            Console.WriteLine("[Packed]: " + url);
            return(jsonPacked);
        }
Example No. 11
        public void TheCrawlShouldRecordAnError()
        {
            var config = new CrawlerConfig
            {
                RootAddress  = new Uri("http://localhost:51746/katelyn-error.html"),
                Listener     = this,
                MaxDepth     = 2,
                CrawlerFlags = CrawlerFlags.IncludeFailureCheck
            };

            Crawler.Crawl(config);
        }
Example No. 12
        public void TheRobotsFileShouldBeCrawledAndSitemapLinksFollowed()
        {
            var config = new CrawlerConfig
            {
                RootAddress = new Uri("http://localhost:51746/"),
                Listener    = this,
                MaxDepth    = 10,
            };

            config.CrawlerFlags |= CrawlerFlags.IncludeRobots;

            Crawler.Crawl(config);
        }
Example No. 13
        public void ThenMatchesShouldBeFound()
        {
            var config = new CrawlerConfig
            {
                RootAddress           = new Uri("http://localhost:51746/"),
                Listener              = this,
                MaxDepth              = 1,
                CrawlerFlags          = CrawlerFlags.IncludeLinks,
                HtmlContentExpression = new Regex("#search-link")
            };

            Crawler.Crawl(config);
        }
Example No. 14
        public void ScriptTagsShouldBeCrawled()
        {
            var config = new CrawlerConfig
            {
                RootAddress = new Uri("http://localhost:51746/"),
                Listener    = this,
                MaxDepth    = 2,
            };

            config.CrawlerFlags |= CrawlerFlags.IncludeLinks;
            config.CrawlerFlags |= CrawlerFlags.IncludeScripts;

            Crawler.Crawl(config);
        }
Example No. 15
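        /// <summary>
        /// Inserts a crawler configuration row into the CrawlerConfigurations table inside a transaction.
        /// </summary>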
        public void Insert(CrawlerConfig conf)
        {
            string sqlText = @"INSERT INTO `housecrawler`.`CrawlerConfigurations`
             (`ConfigurationName`, `ConfigurationValue`, `ConfigurationKey`, `IsEnabled`) 
             VALUES (@ConfigurationName,@ConfigurationValue, @ConfigurationKey,1);";

            using (IDbConnection dbConnection = GetConnection())
            {
                dbConnection.Open();
                IDbTransaction transaction = dbConnection.BeginTransaction();
                var            result      = dbConnection.Execute(sqlText,
                                                                  conf, transaction: transaction);
                transaction.Commit();
            }
        }
Example No. 16
        public void ThePartnerLinkShouldBeCrawled()
        {
            var config = new CrawlerConfig
            {
                RootAddress  = new Uri("http://localhost:51746/partner.html"),
                Listener     = this,
                MaxDepth     = 5,
                PartnerSites = new List <Uri> {
                    new Uri("https://example.com")
                },
                CrawlerFlags = CrawlerFlags.IncludeLinks
            };

            Crawler.Crawl(config);
        }
Example No. 17
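 /// <summary>
 /// Constructor
 /// </summary>
 /// <param name="config">Crawler config</param>
 /// <param name="urlFrontier">Frontier that supplies the URLs to crawl</param>
 /// <param name="fetcher">Fetcher used to download content</param>
 /// <param name="similarContentManager">Judges whether fetched content is similar to existing content</param>
 /// <param name="urlFilters">Filters applied to discovered URLs</param>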
 public Crawler(CrawlerConfig config,
                IUrlFrontier urlFrontier,
                IFetcher fetcher,
                ISimilarContentManager similarContentManager,
                List <IUrlFilter> urlFilters)
 {
     mConfig               = config;
     Status                = CrawlerStatus.STOPPED;
     mUrlFrontier          = urlFrontier;
     mFetcher              = fetcher;
     mSimilarContentJudger = similarContentManager;
     mUrlFilters           = urlFilters;
     mLogger               = new RuntimeLogger(Path.Combine(config.LogDirectory, "Crawler.Log"), true);
     mErrorLogger          = new RuntimeLogger(Path.Combine(config.LogDirectory, "Crawler Error.Log"), false);
 }
Example No. 18
        public static ContentParser <Uri> GetLinkParser(CrawlerConfig config, Uri parent, string content, string contentType)
        {
            switch (contentType)
            {
            case "text/html":
                return(new HtmlLinkParser(config.RootAddress, parent, content, config));

            case "text/plain":
                // robots.txt
                return(new RobotLinkParser(config.RootAddress, parent, content, config));

            case "text/xml":
                // sitemap.xml
                return(new SitemapLinkParser(config.RootAddress, parent, content, config));

            default:
                // Unsupported content type - we still load and measure, but don't look for links
                return(new EmptyLinkParser <Uri>());
            }
        }
Example No. 19
        public void Start()
        {
            PoliteWebCrawler crawler = new CrawlerConfig().CreateCrawler();

            CrawlResult result =
                crawler.Crawl(
                    //new Uri("https://www.komputronik.pl/category/17631/lenovo-ideapad.html"));
                    //new Uri("https://www.komputronik.pl/category/17623/laptopy-lenovo.html"));
                    //new Uri("https://www.komputronik.pl/category/5022/laptopy.html")); // <- ten jest spoko
                    new Uri("https://www.komputronik.pl/category/5801/komputery-pc.html"));


            if (result.ErrorOccurred)
            {
                Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri,
                                  result.ErrorException.Message);
            }
            else
            {
                Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
                Console.WriteLine("Saved successfully");
            }
        }
Example No. 20
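        /// <summary>
        /// Builds a CrawlerConfig from command-line style options, adding each crawler flag
        /// only when the corresponding boolean is set.
        /// </summary>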
        private static CrawlerConfig GetComplexConfig(string address, bool verbose, bool includeImages, bool includeLinks, bool includeScripts, bool includeStyles, bool includeFailureCheck, bool includeRobots, int maxDepth, int delay, string searchExpression, string partnerSites)
        {
            var config = new CrawlerConfig
            {
                RootAddress = new Uri(address),
                Listener    = (verbose) ? new ConsoleListener() : new SparseConsoleListener()
            };

            if (!string.IsNullOrWhiteSpace(searchExpression))
            {
                config.HtmlContentExpression = new Regex(searchExpression);
            }

            if (!string.IsNullOrWhiteSpace(partnerSites))
            {
                config.PartnerSites = partnerSites.Split(',').Select(s => new Uri(s)).ToList();
            }

            if (maxDepth > 0)
            {
                config.MaxDepth = maxDepth;
            }

            if (delay > 0)
            {
                config.CrawlDelay = TimeSpan.FromMilliseconds(delay);
            }

            config.AddCrawlerFlag(() => includeLinks, CrawlerFlags.IncludeLinks);
            config.AddCrawlerFlag(() => includeImages, CrawlerFlags.IncludeImages);
            config.AddCrawlerFlag(() => includeScripts, CrawlerFlags.IncludeScripts);
            config.AddCrawlerFlag(() => includeStyles, CrawlerFlags.IncludeStyles);
            config.AddCrawlerFlag(() => includeFailureCheck, CrawlerFlags.IncludeFailureCheck);
            config.AddCrawlerFlag(() => includeRobots, CrawlerFlags.IncludeRobots);

            return(config);
        }
Example No. 21
        public IHttpActionResult CreateCrawlerConfig(CrawlerConfigDataBindingModel crawlerConfigDataBindingModel)
        {
            // TODO: check for duplicate Route + Path
            //List<CrawlerConfig> existedCrawlerConfigs = _db.CrawlerConfigs.Where(c => c.Route == crawlerConfigDataBindingModel.Route).
            if (!ModelState.IsValid)
            {
                return(BadRequest(ModelState));
            }
            var newConfig = new CrawlerConfig()
            {
                Route               = crawlerConfigDataBindingModel.Route,
                CategoryId          = crawlerConfigDataBindingModel.CategoryId,
                ContentSelector     = crawlerConfigDataBindingModel.ContentSelector,
                DescriptionSelector = crawlerConfigDataBindingModel.DescriptionSelector,
                LinkSelector        = crawlerConfigDataBindingModel.LinkSelector,
                RemovalSelector     = crawlerConfigDataBindingModel.RemovalSelector,
                Path          = crawlerConfigDataBindingModel.Path,
                TitleSelector = crawlerConfigDataBindingModel.TitleSelector
            };

            _db.CrawlerConfigs.Add(newConfig);
            _db.SaveChanges();
            return(Json(newConfig));
        }
Example No. 22
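        // Runs a crawler Docker image with CRAWLER_CONFIG passed through the environment,
        // streams its stdout/stderr into JsonProcessor, and shuts it down on Ctrl+C.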
        static void Main(string[] args)
        {
            var cts = new CancellationTokenSource();

            Console.CancelKeyPress += (sender, e) => {
                if (cts.IsCancellationRequested)
                {
                    return;
                }
                cts.Cancel(false);
                e.Cancel = true;
            };

            //ssh -L 127.0.0.1:2375:/var/run/docker.sock

            var logger   = new LambdaLogger(Console.WriteLine);
            var jsonProc = new JsonProcessor(logger);
            var nodes    = new HashSet <string>();

            jsonProc.OnEot  += () => { Console.WriteLine("-- EOT! --"); };
            jsonProc.OnNode += (n) => {
                if (!nodes.Add(n.Url))
                {
                    Console.WriteLine($"!! Duplicate URL:");
                }
                Console.WriteLine(n.Title);
                Console.WriteLine(n.Url);
                Console.WriteLine($"{nodes.Count} Nodes");
            };
            jsonProc.OnEdges += (e) => { Console.WriteLine($"{e.Edges.Count} Edges"); };

            var baseUrl       = "https://www.ichkoche.at/";
            var crawlerConfig = new CrawlerConfig {
                FollowInternalLinks = true,
                CheckExternalLinks  = false,
                MaxRequestsPerCrawl = 500,
                TakeScreenShots     = false,
                RequestQueue        = { baseUrl },
                UrlFilter           = $"{baseUrl}[.*]",
                MaxConcurrency      = 6,
            };
            //var env = "CRAWLER_CONFIG='" + jsonProc.Serialize(crawlerConfig) + "'";
            //Console.WriteLine(env);
            var image      = "quay.io/0xff/apify-crawler3:master";
            var dockerHost = "tcp://127.0.0.1:2375/";


            var si = new ProcessStartInfo {
                CreateNoWindow = true,
                FileName       = "docker",
                ArgumentList   =
                {
                    "run", "--rm", "-e", "CRAWLER_CONFIG", image
                },
                Environment =
                {
                    { "DOCKER_HOST",    dockerHost                        },
                    { "CRAWLER_CONFIG", jsonProc.Serialize(crawlerConfig) }
                },
                RedirectStandardInput  = true,
                RedirectStandardError  = true,
                RedirectStandardOutput = true,
                UseShellExecute        = false,
            };


            using (var p = new Process {
                StartInfo = si
            }) {
                try {
                    void DataReceived(object sender, DataReceivedEventArgs eventArgs)
                    {
                        if (eventArgs.Data != null)
                        {
                            jsonProc.ProcessMessage(eventArgs.Data);
                        }
                    }
                    p.OutputDataReceived += DataReceived;
                    p.ErrorDataReceived  += DataReceived;
                    p.Start();
                    p.BeginErrorReadLine();
                    p.BeginOutputReadLine();
                    while (!cts.IsCancellationRequested)
                    {
                        if (p.WaitForExit(1000))
                        {
                            break;
                        }
                    }
                    if (cts.IsCancellationRequested)
                    {
                        Console.WriteLine("Ctrl+C");
                        p.StandardInput.WriteLine("\x3");
                        p.StandardInput.Close();
                        p.StandardOutput.Close();
                        p.StandardError.Close();
                        p.WaitForExit(20000);
                    }
                } finally {
                    p.Kill(true);
                }

                Console.WriteLine($"ExitCode={p.ExitCode}");
            }
        }
Example No. 23
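 /// <summary>
 /// Constructor
 /// </summary>
 /// <param name="config">Crawler config</param>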
 public SimpleSimilarContentManager(CrawlerConfig config)
 {
     mLogger = new RuntimeLogger(Path.Combine(config.LogDirectory, "Crawler.Log"), true);
     mConfig = config;
 }
Example No. 24
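 /// <summary>
 /// Constructor
 /// </summary>
 /// <param name="pageExtractor">Extractor used to pull data out of downloaded pages</param>
 /// <param name="config">Crawler config</param>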
 public PageDownloader(IPageExtractor pageExtractor, CrawlerConfig config)
 {
     _pageExtractor = pageExtractor ?? throw new ArgumentNullException(nameof(pageExtractor));
     _config        = config;
 }
Example No. 25
        public void FeedsToCrawl_Is_Never_Null()
        {
            CrawlerConfig config = new CrawlerConfig();

            Assert.NotNull(config.FeedsToCrawl);
        }
Example No. 26
        public void TwitterUsersToCrawl_Is_Never_Null()
        {
            CrawlerConfig config = new CrawlerConfig();

            Assert.NotNull(config.TwitterUsersToCrawl);
        }