/// <summary>
/// Creates a fetcher with a clean fetch directory and two HTTP clients:
/// one that bypasses the proxy and one that uses the default handler.
/// </summary>
/// <param name="config">Crawler config; supplies <c>LogDirectory</c> and <c>FetchDirectory</c>.</param>
public SimpleFetcher(CrawlerConfig config)
{
    mLogger = new RuntimeLogger(Path.Combine(config.LogDirectory, "Crawler.Log"));

    // Start from an empty fetch directory: anything already in it is deleted recursively.
    if (Directory.Exists(config.FetchDirectory))
    {
        Directory.Delete(config.FetchDirectory, true);
    }

    Directory.CreateDirectory(config.FetchDirectory);

    mConfig = config;

    var handler = new HttpClientHandler
    {
        UseProxy = false,
    };

    mClientWithoutProxy = new HttpClient(handler);
    mClientWithProxy = new HttpClient();

    // Direct client: short timeout.
    mClientWithoutProxy.DefaultRequestHeaders.Connection.Add("keep-alive");
    mClientWithoutProxy.DefaultRequestHeaders.UserAgent.ParseAdd(UserAgent);
    mClientWithoutProxy.DefaultRequestHeaders.Accept.ParseAdd("*/*");
    mClientWithoutProxy.DefaultRequestHeaders.AcceptLanguage.ParseAdd("en,zh-CN;q=0.9,zh;q=0.8,zh-TW;q=0.7,de;q=0.6,ru;q=0.5");
    mClientWithoutProxy.Timeout = new TimeSpan(0, 0, 8);

    // Default-handler client: same headers, longer timeout.
    // The Accept / Accept-Language headers were previously added to the direct client a
    // second time, leaving this client without them.
    mClientWithProxy.DefaultRequestHeaders.Connection.Add("keep-alive");
    mClientWithProxy.DefaultRequestHeaders.UserAgent.ParseAdd(UserAgent);
    mClientWithProxy.DefaultRequestHeaders.Accept.ParseAdd("*/*");
    mClientWithProxy.DefaultRequestHeaders.AcceptLanguage.ParseAdd("en,zh-CN;q=0.9,zh;q=0.8,zh-TW;q=0.7,de;q=0.6,ru;q=0.5");
    mClientWithProxy.Timeout = new TimeSpan(0, 0, 30);
}
/// <summary>
/// Creates the URL frontier: opens the crawler log, keeps the config, then reloads
/// the frontier item store and initializes it with the configured initial URLs.
/// </summary>
/// <param name="config">Crawler config</param>
public SimpleUrlFrontier(CrawlerConfig config)
{
    string logFile = Path.Combine(config.LogDirectory, "Crawler.log");
    mLogger = new RuntimeLogger(logFile);

    mConfig = config;

    // Reload must run before Init, exactly as the store is used here.
    config.UrlFrontierItemStore.Reload();
    config.UrlFrontierItemStore.Init(config.InitUrls);
}
/// <summary>
/// Creates a sitemap link parser over the given content.
/// </summary>
/// <param name="root">Root address, stored as-is.</param>
/// <param name="parent">Address of the document the content came from, stored as-is.</param>
/// <param name="content">Raw content to parse.</param>
/// <param name="config">Crawler config, stored as-is.</param>
public SitemapLinkParser(Uri root, Uri parent, string content, CrawlerConfig config)
{
    this._root = root;
    this._parent = parent;
    this._content = content;
    this._config = config;
}
/// <summary>
/// Creates a new YouTube configuration instance.
/// </summary>
/// <param name="config">
/// Crawler config; supplies <c>YouTubeV2ApiKey</c> and <c>YouTubeCategoriesFileName</c>.
/// </param>
public YtConfig(CrawlerConfig config)
{
    // Settings are built from the API key.
    var youTubeSettings = new YouTubeSettings(config.YouTubeV2ApiKey);
    this.settings = youTubeSettings;

    // Categories are built from the categories file name.
    var youTubeCategories = new YouTubeCategories(config.YouTubeCategoriesFileName);
    this.categories = youTubeCategories;
}
/// <summary>
/// Validates a Douban group / city pair with one probe request and stores it as a
/// new crawler configuration row.
/// </summary>
/// <param name="groupId">Douban group id (the error text gives "XMhouse" as an example).</param>
/// <param name="cityName">City name for the group.</param>
/// <exception cref="Exception">
/// Thrown when either argument is null or empty, or when the probe request returns null.
/// </exception>
public void AddDoubanConfig(string groupId, string cityName)
{
    if (string.IsNullOrEmpty(groupId) || string.IsNullOrEmpty(cityName))
    {
        throw new Exception("请输入豆瓣小组Group和城市名称。");
    }

    // Probe the group once; a null result is treated as "group or city is wrong".
    var topics = DoubanService.GetHouseData(groupId, cityName, 1);
    if (topics == null)
    {
        throw new Exception("保存失败!请检查豆瓣小组ID(如:XMhouse)/城市名称(如:厦门)是否正确...");
    }

    var cityInfo = $"{{ 'groupid':'{groupId}','cityname':'{cityName}','pagecount':5}}";

    // The previous version created a throw-away CrawlerConfig and returned when it was
    // non-null, which is always, so the insert below was unreachable.
    // NOTE(review): if duplicate group/city rows must be rejected, look up an existing
    // row with the same ConfigurationValue here; no lookup API is visible from this method.
    var config = new CrawlerConfig()
    {
        ConfigurationKey = 0,
        ConfigurationValue = cityInfo,
        ConfigurationName = ConstConfigName.Douban,
        DataCreateTime = DateTime.Now,
        IsEnabled = true,
    };

    _dapper.Insert(config);
}
/// <summary>
/// Crawls http://localhost:51746/ to depth 1 with link following and failure checks
/// enabled, reporting to this instance as the listener.
/// </summary>
public void TheRootAddressShouldBeCrawled()
{
    var config = new CrawlerConfig();
    config.RootAddress = new Uri("http://localhost:51746/");
    config.Listener = this;
    config.MaxDepth = 1;
    config.CrawlerFlags = CrawlerFlags.IncludeLinks | CrawlerFlags.IncludeFailureCheck;

    Crawler.Crawl(config);
}
/// <summary>
/// Loads the listing page at <c>config.Route + config.Path</c>, selects elements with
/// <c>config.LinkSelector</c> and returns the article URLs that are valid and not
/// already stored in the database.
/// </summary>
/// <param name="config">Crawler config; supplies Route, Path and LinkSelector.</param>
/// <returns>
/// The set of new article URLs; an empty set when the page cannot be loaded.
/// </returns>
private HashSet<string> GetArticleUrls(CrawlerConfig config)
{
    // A set is used so the same link is never returned twice.
    var setLink = new HashSet<string>();

    // Load the page and parse its HTML into a document.
    HtmlDocument document = null;
    var url = config.Route + config.Path;
    try
    {
        document = _htmlWeb.Load(url);
    }
    catch (Exception err)
    {
        Console.WriteLine("LOAD HTML DOC FAILED: " + err.Message);
        return setLink;
    }

    // Collect every element matched by the link selector.
    var aItems = document.DocumentNode.QuerySelectorAll(config.LinkSelector).ToList();
    var existedLinks = from article in _db.Articles select article.Link;

    // Resolve each href to a full URL and keep the new ones.
    foreach (var item in aItems)
    {
        // The selector may match elements without an href; skip them instead of
        // dereferencing a missing attribute.
        var hrefValue = item.Attributes["href"]?.Value;
        if (hrefValue == null)
        {
            continue;
        }

        // Prefix the route only when the href does not already contain it. The check
        // used to run against an always-empty string, so absolute URLs were prefixed too.
        var link = hrefValue.Contains(config.Route)
            ? hrefValue
            : config.Route + hrefValue;

        if (!ValidateHelper.IsUrlValid(link))
        {
            Console.WriteLine($"[Not valid]: {link}");
            continue;
        }

        // Skip links that are already stored in the database.
        if (existedLinks.Contains(link))
        {
            Console.WriteLine($"[Existed]: {link}");
            continue;
        }

        Console.WriteLine($"[Get success]: {link}");
        setLink.Add(link);
    }

    return setLink;
}
/// <summary>
/// Crawls http://PageNotFound/ to depth 2 with link following enabled, reporting to
/// this instance as the listener.
/// </summary>
public void TheCrawlShouldRecordAnError()
{
    var config = new CrawlerConfig();
    config.RootAddress = new Uri("http://PageNotFound/");
    config.Listener = this;
    config.MaxDepth = 2;
    config.CrawlerFlags = CrawlerFlags.IncludeLinks;

    Crawler.Crawl(config);
}
/// <summary>
/// Stores a new crawl record for <paramref name="baseUrl"/> together with the
/// configuration serialized as indented, camelCase JSON.
/// </summary>
/// <param name="baseUrl">Base URL of the crawl.</param>
/// <param name="config">Configuration to serialize into the record.</param>
/// <returns>The <c>Id</c> of the crawl record after the insert.</returns>
public long NewCrawl(string baseUrl, CrawlerConfig config)
{
    var serializerOptions = new JsonSerializerOptions
    {
        WriteIndented = true,
        PropertyNamingPolicy = JsonNamingPolicy.CamelCase
    };
    string serializedConfig = JsonSerializer.Serialize(config, serializerOptions);

    var crawl = new Crawl
    {
        BaseUrl = baseUrl,
        Configuration = serializedConfig
    };
    _db.Insert(crawl);

    return crawl.Id;
}
/// <summary>
/// Serializes a <c>{ Link, ConfigId }</c> pair to JSON and logs the packed URL to the console.
/// </summary>
/// <param name="config">Crawler config whose <c>Id</c> is packed as <c>ConfigId</c>.</param>
/// <param name="url">Article URL, packed as <c>Link</c>.</param>
/// <returns>The JSON text.</returns>
private string PackConfigToJson(CrawlerConfig config, string url)
{
    string json = JsonConvert.SerializeObject(new { Link = url, ConfigId = config.Id });

    Console.WriteLine("[Packed]: " + url);

    return json;
}
/// <summary>
/// Crawls http://localhost:51746/katelyn-error.html to depth 2 with failure checks
/// enabled, reporting to this instance as the listener.
/// </summary>
public void TheCrawlShouldRecordAnError()
{
    var config = new CrawlerConfig();
    config.RootAddress = new Uri("http://localhost:51746/katelyn-error.html");
    config.Listener = this;
    config.MaxDepth = 2;
    config.CrawlerFlags = CrawlerFlags.IncludeFailureCheck;

    Crawler.Crawl(config);
}
/// <summary>
/// Crawls http://localhost:51746/ to depth 10 with the robots flag added on top of
/// the config's default flags, reporting to this instance as the listener.
/// </summary>
public void TheRobotsFileShouldBeCrawledAndSitemapLinksFollowed()
{
    var config = new CrawlerConfig();
    config.RootAddress = new Uri("http://localhost:51746/");
    config.Listener = this;
    config.MaxDepth = 10;

    // OR the flag in so whatever flags the config starts with are kept.
    config.CrawlerFlags = config.CrawlerFlags | CrawlerFlags.IncludeRobots;

    Crawler.Crawl(config);
}
/// <summary>
/// Crawls http://localhost:51746/ to depth 1 with link following enabled and the
/// HTML content expression set to the regex <c>#search-link</c>, reporting to this
/// instance as the listener.
/// </summary>
public void ThenMatchesShouldBeFound()
{
    var config = new CrawlerConfig();
    config.RootAddress = new Uri("http://localhost:51746/");
    config.Listener = this;
    config.MaxDepth = 1;
    config.CrawlerFlags = CrawlerFlags.IncludeLinks;
    config.HtmlContentExpression = new Regex("#search-link");

    Crawler.Crawl(config);
}
/// <summary>
/// Crawls http://localhost:51746/ to depth 2 with the link and script flags added on
/// top of the config's default flags, reporting to this instance as the listener.
/// </summary>
public void ScriptTagsShouldBeCrawled()
{
    var config = new CrawlerConfig();
    config.RootAddress = new Uri("http://localhost:51746/");
    config.Listener = this;
    config.MaxDepth = 2;

    // Flags are OR-ed in one at a time so the config's starting flags are kept.
    config.CrawlerFlags = config.CrawlerFlags | CrawlerFlags.IncludeLinks;
    config.CrawlerFlags = config.CrawlerFlags | CrawlerFlags.IncludeScripts;

    Crawler.Crawl(config);
}
/// <summary>
/// Inserts a crawler configuration row inside a transaction.
/// </summary>
/// <param name="conf">
/// Supplies the <c>@ConfigurationName</c>, <c>@ConfigurationValue</c> and
/// <c>@ConfigurationKey</c> parameters. The statement writes <c>IsEnabled</c> as the
/// literal 1, so <c>conf.IsEnabled</c> is not used.
/// </param>
public void Insert(CrawlerConfig conf)
{
    string sqlText = @"INSERT INTO `housecrawler`.`CrawlerConfigurations` (`ConfigurationName`, `ConfigurationValue`, `ConfigurationKey`, `IsEnabled`) VALUES (@ConfigurationName,@ConfigurationValue, @ConfigurationKey,1);";

    using (IDbConnection dbConnection = GetConnection())
    {
        dbConnection.Open();

        // Dispose the transaction deterministically; it was previously never disposed,
        // including on the path where Execute throws before Commit.
        using (IDbTransaction transaction = dbConnection.BeginTransaction())
        {
            dbConnection.Execute(sqlText, conf, transaction: transaction);
            transaction.Commit();
        }
    }
}
/// <summary>
/// Crawls http://localhost:51746/partner.html to depth 5 with link following enabled
/// and https://example.com registered as a partner site, reporting to this instance
/// as the listener.
/// </summary>
public void ThePartnerLinkShouldBeCrawled()
{
    var config = new CrawlerConfig();
    config.RootAddress = new Uri("http://localhost:51746/partner.html");
    config.Listener = this;
    config.MaxDepth = 5;
    config.PartnerSites = new List<Uri> { new Uri("https://example.com") };
    config.CrawlerFlags = CrawlerFlags.IncludeLinks;

    Crawler.Crawl(config);
}
/// <summary>
/// Creates a crawler in the <c>STOPPED</c> state, wires up its collaborators and
/// opens the two log files under <c>config.LogDirectory</c>.
/// </summary>
/// <param name="config">Crawler config; supplies <c>LogDirectory</c>.</param>
/// <param name="urlFrontier">URL frontier to use.</param>
/// <param name="fetcher">Fetcher to use.</param>
/// <param name="similarContentManager">Similar-content manager to use.</param>
/// <param name="urlFilters">URL filters to use.</param>
public Crawler(
    CrawlerConfig config,
    IUrlFrontier urlFrontier,
    IFetcher fetcher,
    ISimilarContentManager similarContentManager,
    List<IUrlFilter> urlFilters)
{
    this.mConfig = config;
    this.Status = CrawlerStatus.STOPPED;

    this.mUrlFrontier = urlFrontier;
    this.mFetcher = fetcher;
    this.mSimilarContentJudger = similarContentManager;
    this.mUrlFilters = urlFilters;

    // General log.
    string logPath = Path.Combine(config.LogDirectory, "Crawler.Log");
    this.mLogger = new RuntimeLogger(logPath, true);

    // Error log.
    string errorLogPath = Path.Combine(config.LogDirectory, "Crawler Error.Log");
    this.mErrorLogger = new RuntimeLogger(errorLogPath, false);
}
/// <summary>
/// Picks the link parser for a piece of downloaded content based on its content type.
/// </summary>
/// <param name="config">Crawler config; its <c>RootAddress</c> is passed to the parser as the root.</param>
/// <param name="parent">Address of the document the content came from.</param>
/// <param name="content">Raw content to parse.</param>
/// <param name="contentType">
/// Content type, matched exactly against <c>text/html</c>, <c>text/plain</c> and <c>text/xml</c>.
/// </param>
/// <returns>
/// An HTML, robots or sitemap link parser for the matching type; otherwise an empty
/// parser that finds no links.
/// </returns>
public static ContentParser<Uri> GetLinkParser(CrawlerConfig config, Uri parent, string content, string contentType)
{
    if (contentType == "text/html")
    {
        return new HtmlLinkParser(config.RootAddress, parent, content, config);
    }

    if (contentType == "text/plain")
    {
        // robots.txt
        return new RobotLinkParser(config.RootAddress, parent, content, config);
    }

    if (contentType == "text/xml")
    {
        // sitemap.xml
        return new SitemapLinkParser(config.RootAddress, parent, content, config);
    }

    // Unsupported content type - we still load and measure, but don't look for links
    return new EmptyLinkParser<Uri>();
}
/// <summary>
/// Creates a crawler from a default <c>CrawlerConfig</c>, crawls one hard-coded
/// komputronik.pl category page and prints the outcome to the console.
/// </summary>
public void Start()
{
    PoliteWebCrawler crawler = new CrawlerConfig().CreateCrawler();

    // Other category pages used during development:
    //   https://www.komputronik.pl/category/17631/lenovo-ideapad.html
    //   https://www.komputronik.pl/category/17623/laptopy-lenovo.html
    //   https://www.komputronik.pl/category/5022/laptopy.html   <- this one works fine
    var startUri = new Uri("https://www.komputronik.pl/category/5801/komputery-pc.html");
    CrawlResult result = crawler.Crawl(startUri);

    if (!result.ErrorOccurred)
    {
        Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
        Console.WriteLine("Saved successfully");
        return;
    }

    Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
}
/// <summary>
/// Builds a <c>CrawlerConfig</c> from individual option values.
/// </summary>
/// <param name="address">Root address to crawl.</param>
/// <param name="verbose">Selects <c>ConsoleListener</c> when true, <c>SparseConsoleListener</c> otherwise.</param>
/// <param name="includeImages">Passed to <c>AddCrawlerFlag</c> for <c>IncludeImages</c>.</param>
/// <param name="includeLinks">Passed to <c>AddCrawlerFlag</c> for <c>IncludeLinks</c>.</param>
/// <param name="includeScripts">Passed to <c>AddCrawlerFlag</c> for <c>IncludeScripts</c>.</param>
/// <param name="includeStyles">Passed to <c>AddCrawlerFlag</c> for <c>IncludeStyles</c>.</param>
/// <param name="includeFailureCheck">Passed to <c>AddCrawlerFlag</c> for <c>IncludeFailureCheck</c>.</param>
/// <param name="includeRobots">Passed to <c>AddCrawlerFlag</c> for <c>IncludeRobots</c>.</param>
/// <param name="maxDepth">Applied only when greater than zero.</param>
/// <param name="delay">Crawl delay in milliseconds; applied only when greater than zero.</param>
/// <param name="searchExpression">Regex for HTML content; ignored when null or whitespace.</param>
/// <param name="partnerSites">Comma-separated partner site URIs; ignored when null or whitespace.</param>
/// <returns>The populated configuration.</returns>
private static CrawlerConfig GetComplexConfig(string address, bool verbose, bool includeImages, bool includeLinks, bool includeScripts, bool includeStyles, bool includeFailureCheck, bool includeRobots, int maxDepth, int delay, string searchExpression, string partnerSites)
{
    var config = new CrawlerConfig();
    config.RootAddress = new Uri(address);

    if (verbose)
    {
        config.Listener = new ConsoleListener();
    }
    else
    {
        config.Listener = new SparseConsoleListener();
    }

    bool hasSearchExpression = !string.IsNullOrWhiteSpace(searchExpression);
    if (hasSearchExpression)
    {
        config.HtmlContentExpression = new Regex(searchExpression);
    }

    bool hasPartnerSites = !string.IsNullOrWhiteSpace(partnerSites);
    if (hasPartnerSites)
    {
        config.PartnerSites = (from site in partnerSites.Split(',')
                               select new Uri(site)).ToList();
    }

    // Non-positive values leave the config's own defaults untouched.
    if (maxDepth > 0)
    {
        config.MaxDepth = maxDepth;
    }

    if (delay > 0)
    {
        config.CrawlDelay = TimeSpan.FromMilliseconds(delay);
    }

    config.AddCrawlerFlag(() => includeLinks, CrawlerFlags.IncludeLinks);
    config.AddCrawlerFlag(() => includeImages, CrawlerFlags.IncludeImages);
    config.AddCrawlerFlag(() => includeScripts, CrawlerFlags.IncludeScripts);
    config.AddCrawlerFlag(() => includeStyles, CrawlerFlags.IncludeStyles);
    config.AddCrawlerFlag(() => includeFailureCheck, CrawlerFlags.IncludeFailureCheck);
    config.AddCrawlerFlag(() => includeRobots, CrawlerFlags.IncludeRobots);

    return config;
}
/// <summary>
/// Creates a crawler configuration from the posted binding model and saves it.
/// </summary>
/// <param name="crawlerConfigDataBindingModel">Posted values for the new configuration.</param>
/// <returns>
/// 400 Bad Request when the model state is invalid or no model was bound;
/// otherwise the saved configuration as JSON.
/// </returns>
public IHttpActionResult CreateCrawlerConfig(CrawlerConfigDataBindingModel crawlerConfigDataBindingModel)
{
    // TODO: reject a configuration whose Route + Path duplicates an existing one.

    if (!ModelState.IsValid)
    {
        return BadRequest(ModelState);
    }

    // A missing or unparseable body can bind to null while ModelState is still valid;
    // answer 400 instead of failing with a NullReferenceException below.
    if (crawlerConfigDataBindingModel == null)
    {
        return BadRequest("Request body is required.");
    }

    var newConfig = new CrawlerConfig()
    {
        Route = crawlerConfigDataBindingModel.Route,
        CategoryId = crawlerConfigDataBindingModel.CategoryId,
        ContentSelector = crawlerConfigDataBindingModel.ContentSelector,
        DescriptionSelector = crawlerConfigDataBindingModel.DescriptionSelector,
        LinkSelector = crawlerConfigDataBindingModel.LinkSelector,
        RemovalSelector = crawlerConfigDataBindingModel.RemovalSelector,
        Path = crawlerConfigDataBindingModel.Path,
        TitleSelector = crawlerConfigDataBindingModel.TitleSelector
    };

    _db.CrawlerConfigs.Add(newConfig);
    _db.SaveChanges();

    return Json(newConfig);
}
/// <summary>
/// Runs the crawler Docker image as a child process, feeds every line it writes to
/// stdout/stderr into a <c>JsonProcessor</c>, and prints node/edge progress to the
/// console. Ctrl+C asks the child to stop; anything still running afterwards is killed.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    var cts = new CancellationTokenSource();
    Console.CancelKeyPress += (sender, e) =>
    {
        // Only the first Ctrl+C is intercepted; a second one is left with
        // e.Cancel unset so it is not suppressed here.
        if (cts.IsCancellationRequested)
        {
            return;
        }

        cts.Cancel(false);
        e.Cancel = true;
    };

    // The Docker endpoint used below can be a forwarded socket, e.g.:
    //ssh -L 127.0.0.1:2375:/var/run/docker.sock
    var logger = new LambdaLogger(Console.WriteLine);
    var jsonProc = new JsonProcessor(logger);
    var nodes = new HashSet<string>();

    jsonProc.OnEot += () =>
    {
        Console.WriteLine("-- EOT! --");
    };
    jsonProc.OnNode += (n) =>
    {
        if (!nodes.Add(n.Url))
        {
            Console.WriteLine("!! Duplicate URL:");
        }

        Console.WriteLine(n.Title);
        Console.WriteLine(n.Url);
        Console.WriteLine($"{nodes.Count} Nodes");
    };
    jsonProc.OnEdges += (e) =>
    {
        Console.WriteLine($"{e.Edges.Count} Edges");
    };

    var baseUrl = "https://www.ichkoche.at/";
    var crawlerConfig = new CrawlerConfig
    {
        FollowInternalLinks = true,
        CheckExternalLinks = false,
        MaxRequestsPerCrawl = 500,
        TakeScreenShots = false,
        RequestQueue = { baseUrl },
        UrlFilter = $"{baseUrl}[.*]",
        MaxConcurrency = 6,
    };

    //var env = "CRAWLER_CONFIG='" + jsonProc.Serialize(crawlerConfig) + "'";
    //Console.WriteLine(env);
    var image = "quay.io/0xff/apify-crawler3:master";
    var dockerHost = "tcp://127.0.0.1:2375/";

    var si = new ProcessStartInfo
    {
        CreateNoWindow = true,
        FileName = "docker",
        // "-e CRAWLER_CONFIG" without a value forwards the variable set in Environment below.
        ArgumentList = { "run", "--rm", "-e", "CRAWLER_CONFIG", image },
        Environment =
        {
            { "DOCKER_HOST", dockerHost },
            { "CRAWLER_CONFIG", jsonProc.Serialize(crawlerConfig) }
        },
        RedirectStandardInput = true,
        RedirectStandardError = true,
        RedirectStandardOutput = true,
        UseShellExecute = false,
    };

    using (var p = new Process { StartInfo = si })
    {
        // Tracks whether Start() succeeded so cleanup never touches a process that was
        // never launched (which would throw and hide the original start-up error).
        var started = false;
        try
        {
            void DataReceived(object sender, DataReceivedEventArgs eventArgs)
            {
                if (eventArgs.Data != null)
                {
                    jsonProc.ProcessMessage(eventArgs.Data);
                }
            }

            p.OutputDataReceived += DataReceived;
            p.ErrorDataReceived += DataReceived;

            p.Start();
            started = true;

            p.BeginErrorReadLine();
            p.BeginOutputReadLine();

            // Poll once per second so a Ctrl+C is noticed promptly.
            while (!cts.IsCancellationRequested)
            {
                if (p.WaitForExit(1000))
                {
                    break;
                }
            }

            if (cts.IsCancellationRequested)
            {
                Console.WriteLine("Ctrl+C");
                p.StandardInput.WriteLine("\x3");
                p.StandardInput.Close();

                // stdout/stderr are in asynchronous read mode (Begin*ReadLine above);
                // accessing p.StandardOutput / p.StandardError now would throw
                // InvalidOperationException, so they are left to the async readers.
                p.WaitForExit(20000);
            }
        }
        finally
        {
            if (started)
            {
                try
                {
                    if (!p.HasExited)
                    {
                        p.Kill(true);
                    }
                }
                catch (InvalidOperationException)
                {
                    // The process exited between the HasExited check and Kill.
                }

                // Wait for the exit to be observed (and pending output events to be
                // delivered) so ExitCode can be read below.
                p.WaitForExit();
            }
        }

        Console.WriteLine($"ExitCode={p.ExitCode}");
    }
}
/// <summary>
/// Creates the similar-content manager: opens the crawler log under
/// <c>config.LogDirectory</c> and keeps the config.
/// </summary>
/// <param name="config">Crawler config; supplies <c>LogDirectory</c>.</param>
public SimpleSimilarContentManager(CrawlerConfig config)
{
    string logPath = Path.Combine(config.LogDirectory, "Crawler.Log");
    this.mLogger = new RuntimeLogger(logPath, true);

    this.mConfig = config;
}
/// <summary>
/// Creates a page downloader.
/// </summary>
/// <param name="pageExtractor">Page extractor to use; must not be null.</param>
/// <param name="config">Crawler config, stored as-is.</param>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="pageExtractor"/> is null.
/// </exception>
public PageDownloader(IPageExtractor pageExtractor, CrawlerConfig config)
{
    // Report the parameter name, not the interface type name, as the offending argument.
    _pageExtractor = pageExtractor ?? throw new ArgumentNullException(nameof(pageExtractor));
    _config = config;
}
/// <summary>
/// A freshly constructed <c>CrawlerConfig</c> exposes a non-null <c>FeedsToCrawl</c>.
/// </summary>
public void FeedsToCrawl_Is_Never_Null()
{
    var config = new CrawlerConfig();

    var feeds = config.FeedsToCrawl;

    Assert.NotNull(feeds);
}
/// <summary>
/// A freshly constructed <c>CrawlerConfig</c> exposes a non-null <c>TwitterUsersToCrawl</c>.
/// </summary>
public void TwitterUsersToCrawl_Is_Never_Null()
{
    var config = new CrawlerConfig();

    var twitterUsers = config.TwitterUsersToCrawl;

    Assert.NotNull(twitterUsers);
}