/// <summary>
/// Raises the <see cref="UpdateProgress"/> event.
/// </summary>
protected virtual void OnUpdateProgress(UpdateProgressEventArgs e) => UpdateProgress?.Invoke(this, e);
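// Illustrative sketch only (not part of the library): because OnUpdateProgress is
// protected virtual, a derived class could extend progress reporting without
// re-subscribing to the event. The logger below is a hypothetical name.
//
//     protected override void OnUpdateProgress(UpdateProgressEventArgs e)
//     {
//         logger.Log(e.Status);       // hypothetical logging hook
//         base.OnUpdateProgress(e);   // still raise the event for subscribers
//     }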
/// <summary>
/// Uses the current settings to extract data from the Internet and write
/// the data to a CSV file with the given name.
/// </summary>
/// <param name="csvFile">Name of the CSV file to write the extracted data to.</param>
public async Task RunAsync(string csvFile)
{
    int current = 0;
    int total;
    bool hasMorePages = false;
    string dataSeparator = DataSeparator ?? string.Empty;

    UrlsScanned = 0;
    UrlErrors = 0;

    // Verify an output file was specified
    if (csvFile == null)
    {
        throw new ArgumentNullException(nameof(csvFile));
    }

    // Check for obvious errors
    if (string.IsNullOrWhiteSpace(Url))
    {
        throw new Exception("A valid URL is required.");
    }

    // Parse selectors
    SelectorCollection containerSelectors = Selector.ParseSelector(ContainerSelector);
    SelectorCollection itemSelectors = Selector.ParseSelector(ItemSelector);
    SelectorCollection nextPageSelectors = Selector.ParseSelector(NextPageSelector);
    foreach (Field field in Fields)
    {
        field.Selectors = Selector.ParseSelector(field.Selector);
    }

    // Validate selectors (ContainerSelector is optional)
    if (!itemSelectors.Any())
    {
        throw new Exception($"A valid {nameof(ItemSelector)} is required.");
    }
    if (Placeholder.UrlContainsPlaceholder(Url, PageIterator.PagePlaceholder) && !nextPageSelectors.Any())
    {
        throw new Exception($"A valid {nameof(NextPageSelector)} is required when a {{page}} placeholder is used.");
    }

    // Initialize placeholder iterator
    PlaceholderIterator placeholderIterator = new PlaceholderIterator(Url);
    foreach (var placeholder in Placeholders)
    {
        placeholderIterator.Add(new PlaceholderIteratorItem(placeholder));
    }
    total = placeholderIterator.GetTotalUrlCount();

    // Initialize page iterator
    PageIterator pageIterator = new PageIterator(nextPageSelectors);

    // Initialize UpdateProgress event arguments
    UpdateProgressEventArgs eventArgs = new UpdateProgressEventArgs();

    using (CsvWriter writer = new CsvWriter(csvFile))
    {
        // Write column headers
        if (WriteColumnHeaders)
        {
            writer.WriteRow(Fields.Select(f => f.Name));
        }

        // Scan URLs
        placeholderIterator.Reset(out string? url);
        do
        {
            pageIterator.Reset(url);
            do
            {
                try
                {
                    // Get the URL for the current page
                    url = pageIterator.GetCurrentPageUrl();
                    eventArgs.Status = $"Scanning '{url}'";
                    eventArgs.Percent = UpdateProgressEventArgs.CalculatePercent(current, total);
                    OnUpdateProgress(eventArgs);

                    // Handle cancel request
                    if (eventArgs.Cancel)
                    {
                        eventArgs.Status = "Scan cancelled";
                        OnUpdateProgress(eventArgs);
                        return;
                    }

                    // Download and parse next web page
                    string html = await DownloadUrlAsync(url);
                    HtmlDocument document = HtmlDocument.FromHtml(html);

                    // Search for containers (fall back to the document root when no container selector was given)
                    IEnumerable<HtmlElementNode> containers = containerSelectors.Any() ?
                        containerSelectors.Find(document.RootNodes) :
                        document.RootNodes.OfType<HtmlElementNode>();
                    IEnumerable<HtmlElementNode> nodes = itemSelectors.Find(containers);
                    hasMorePages = pageIterator.CheckIfMorePages(document);

                    // Search for fields in each item container
                    foreach (HtmlElementNode node in nodes)
                    {
                        foreach (Field field in Fields)
                        {
                            IEnumerable<HtmlElementNode> matchingNodes = field.FindValue(node);
                            field.Value = string.Join(dataSeparator, matchingNodes.Select(n => field.GetValueFromNode(n)));
                        }
                        writer.WriteRow(Fields.Select(f => f.Value));
                    }
                    UrlsScanned++;
                }
                catch (Exception ex)
                {
                    UrlErrors++;
                    hasMorePages = false;
                    eventArgs.Status = $"ERROR : '{url}' : {ex.Message}";
                    OnUpdateProgress(eventArgs);

                    // Handle cancel request
                    if (eventArgs.Cancel)
                    {
                        eventArgs.Status = "Scan cancelled";
                        OnUpdateProgress(eventArgs);
                        return;
                    }
                }
            } while (hasMorePages);
            current++;
        } while (placeholderIterator.Next(out url));

        eventArgs.Status = "Scan complete";
        eventArgs.Percent = 100;
        OnUpdateProgress(eventArgs);
    }
}
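// Illustrative usage sketch (not part of the library): the class name "Scraper", the
// sample URL/selectors, and object-initializer support on Field are assumptions; the
// Url, ItemSelector, NextPageSelector, Fields, UpdateProgress, and RunAsync members
// are those defined above. Note that the {page} placeholder in the URL requires a
// NextPageSelector, per the validation in RunAsync.
//
//     var scraper = new Scraper
//     {
//         Url = "http://www.example.com/catalog?page={page}",
//         ItemSelector = ".product",
//         NextPageSelector = "a.next-page",
//     };
//     scraper.Fields.Add(new Field { Name = "Title", Selector = ".product-title" });
//     scraper.Fields.Add(new Field { Name = "Price", Selector = ".product-price" });
//     scraper.UpdateProgress += (sender, e) => Console.WriteLine($"{e.Percent}% {e.Status}");
//     await scraper.RunAsync("products.csv");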