/// <summary>
/// Constructs a new WebCrawler instance. Doesn't start it yet.
/// </summary>
/// <param name="startURL">The URL to begin crawling at.</param>
/// <param name="prefixes">
/// The valid prefixes of an URL to load (usually the page domain that you want to crawl through).
/// ALL pages are valid if null. Null entries are treated as the empty prefix. The array is copied
/// and normalized ("http://", "https://" and "www." are stripped); the caller's array is never modified.
/// </param>
/// <param name="onNewPage">The function to execute whenever a valid page is found. Returns if the links inside this page should be visited.</param>
/// <param name="onError">The function to execute whenever the response is invalid. Returns if the WebCrawler should continue running. If null, a handler that always returns true is used.</param>
/// <param name="webRequestFactory">A WebRequestFactory to construct the Requests with.</param>
/// <param name="threadCount">The number of worker-threads to use.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="startURL"/>, <paramref name="onNewPage"/> or <paramref name="webRequestFactory"/> is null.
/// </exception>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="prefixes"/> is an empty array, or <paramref name="threadCount"/> is less than 1.
/// </exception>
public WebCrawler(string startURL, string[] prefixes, Func<string, WebCrawler, bool> onNewPage, Func<Exception, bool> onError, WebRequestFactory webRequestFactory, int threadCount = 1)
{
    // Validate every argument before touching any state; the order of the checks
    // determines which exception wins when several arguments are invalid.
    if (startURL == null)
    {
        throw new ArgumentNullException(nameof(startURL));
    }

    if (prefixes != null && prefixes.Length < 1)
    {
        throw new ArgumentOutOfRangeException(nameof(prefixes));
    }

    if (onNewPage == null)
    {
        throw new ArgumentNullException(nameof(onNewPage));
    }

    if (webRequestFactory == null)
    {
        throw new ArgumentNullException(nameof(webRequestFactory));
    }

    if (threadCount < 1)
    {
        throw new ArgumentOutOfRangeException(nameof(threadCount));
    }

    // Normalize into a private copy so the array passed in by the caller stays untouched.
    string[] normalizedPrefixes;

    if (prefixes == null)
    {
        // A single empty prefix stands in for "no restriction".
        normalizedPrefixes = new string[] { "" };
    }
    else
    {
        normalizedPrefixes = new string[prefixes.Length];

        for (int i = 0; i < prefixes.Length; i++)
        {
            string prefix = prefixes[i];

            normalizedPrefixes[i] = prefix == null
                ? ""
                : prefix.Replace("http://", "").Replace("https://", "").Replace("www.", "");
        }
    }

    StartURL = startURL;
    Prefixes = normalizedPrefixes;
    OnNewPage = onNewPage;
    OnError = onError ?? (e => true);
    WebRequestFactory = webRequestFactory;

    using (WebCrawlerStateMutex.Lock())
    {
        // Seed the state with the start URL: queued for crawling and already marked as visited.
        CurrentState = new WebCrawlerState();
        CurrentState.ToGo.Add(StartURL);
        CurrentState.VisitedPages.Add(StartURL, true);
    }

    // Only the slots are allocated here; no thread is created or started by the constructor.
    crawlerThreads = new Thread[threadCount];
}
/// <summary>
/// Constructs a new WebCrawler instance that is restricted to a single URL prefix. Doesn't start it yet.
/// </summary>
/// <remarks>
/// Convenience overload: wraps <paramref name="prefix"/> in a one-element array and forwards
/// every argument unchanged to the constructor that accepts several prefixes.
/// </remarks>
/// <param name="startURL">The URL to begin crawling at.</param>
/// <param name="prefix">The valid prefix of an URL to load (usually the page domain that you want to crawl through).</param>
/// <param name="onNewPage">The function to execute whenever a valid page is found. Returns if the links inside this page should be visited.</param>
/// <param name="onError">The function to execute whenever the response is invalid. Returns if the WebCrawler should continue running.</param>
/// <param name="webRequestFactory">A WebRequestFactory to construct the Requests with.</param>
/// <param name="threadCount">The number of worker-threads to use.</param>
public WebCrawler(
    string startURL,
    string prefix,
    Func<string, WebCrawler, bool> onNewPage,
    Func<Exception, bool> onError,
    WebRequestFactory webRequestFactory,
    int threadCount = 1)
    : this(startURL, new[] { prefix }, onNewPage, onError, webRequestFactory, threadCount)
{
}