/// <summary>
/// Restores a previously saved crawler state from a JSON file and makes it the current state.
/// The caches of the webRequestFactory are not part of the saved state and are not restored.
/// </summary>
/// <param name="fileName">The filename of the saved state.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="fileName"/> is null.</exception>
public void LoadState(string fileName)
{
    if (fileName == null)
    {
        throw new ArgumentNullException(nameof(fileName));
    }

    // The file is read while the state lock is held, so the swap of CurrentState
    // happens under the same lock acquisition as the deserialization.
    using (WebCrawlerStateMutex.Lock())
    {
        WebCrawlerState restoredState = Serializer.ReadJsonData<WebCrawlerState>(fileName);
        CurrentState = restoredState;
    }
}
/// <summary>
/// Constructs a new WebCrawler instance. Doesn't start it yet.
/// </summary>
/// <param name="startURL">The URL to begin crawling at.</param>
/// <param name="prefixes">
/// The valid prefixes of an URL to load (usually the page domain that you want to crawl through).
/// If null, a single empty prefix is used (ALL pages are valid). Null entries are treated as empty
/// prefixes; every occurrence of "http://", "https://" and "www." is removed from the other entries.
/// The passed array itself is not modified.
/// </param>
/// <param name="onNewPage">The function to execute whenever a valid page is found.</param>
/// <param name="onError">
/// The function to execute whenever the response is invalid.
/// If null, a handler that always returns true is used.
/// </param>
/// <param name="webRequestFactory">A WebRequestFactory to construct the Requests with.</param>
/// <param name="threadCount">The number of worker-threads to use. Must be at least 1.</param>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="startURL"/>, <paramref name="onNewPage"/> or
/// <paramref name="webRequestFactory"/> is null.
/// </exception>
/// <exception cref="ArgumentOutOfRangeException">
/// Thrown when <paramref name="prefixes"/> is an empty array or <paramref name="threadCount"/> is less than 1.
/// </exception>
public WebCrawler(string startURL, string[] prefixes, Func<string, WebCrawler, bool> onNewPage, Func<Exception, bool> onError, WebRequestFactory webRequestFactory, int threadCount = 1)
{
    if (startURL == null)
    {
        throw new ArgumentNullException(nameof(startURL));
    }

    if (prefixes == null)
    {
        prefixes = new string[] { "" };
    }
    else if (prefixes.Length < 1)
    {
        throw new ArgumentOutOfRangeException(nameof(prefixes));
    }

    // Normalize into a fresh array so the caller's array is never modified.
    string[] normalizedPrefixes = new string[prefixes.Length];

    for (int i = 0; i < prefixes.Length; i++)
    {
        if (prefixes[i] == null)
        {
            normalizedPrefixes[i] = "";
        }
        else
        {
            // Strip scheme and "www." so prefixes are stored without them.
            normalizedPrefixes[i] = prefixes[i].Replace("http://", "").Replace("https://", "").Replace("www.", "");
        }
    }

    if (onNewPage == null)
    {
        throw new ArgumentNullException(nameof(onNewPage));
    }

    if (webRequestFactory == null)
    {
        throw new ArgumentNullException(nameof(webRequestFactory));
    }

    if (threadCount < 1)
    {
        throw new ArgumentOutOfRangeException(nameof(threadCount));
    }

    StartURL = startURL;
    Prefixes = normalizedPrefixes;
    OnNewPage = onNewPage;

    if (onError == null)
    {
        // Default error handler: always returns true.
        OnError = (e) => true;
    }
    else
    {
        OnError = onError;
    }

    WebRequestFactory = webRequestFactory;

    using (WebCrawlerStateMutex.Lock())
    {
        // Seed the state with the start URL: queue it and record it as visited
        // in the same step.
        CurrentState = new WebCrawlerState();
        CurrentState.ToGo.Add(StartURL);
        CurrentState.VisitedPages.Add(StartURL, true);
    }

    // Only the thread slots are allocated here; no thread is created or started.
    crawlerThreads = new Thread[threadCount];
}