Exemple #1
0
 /// <summary>
 /// Notifies every subscriber of the <see cref="UpdateProgress"/> event.
 /// </summary>
 /// <param name="e">Event data handed to each subscriber.</param>
 protected virtual void OnUpdateProgress(UpdateProgressEventArgs e)
 {
     // Snapshot the delegate so the null check and the call see the same value.
     var handler = UpdateProgress;
     if (handler != null)
     {
         handler(this, e);
     }
 }
Exemple #2
0
        /// <summary>
        /// Uses the current settings to extract data from the Internet and writes
        /// the data to a CSV file with the given name.
        /// </summary>
        /// <remarks>
        /// <para>
        /// <c>UrlsScanned</c> and <c>UrlErrors</c> are reset to zero at the start of every run.
        /// One CSV row is written for each item matched by <c>ItemSelector</c>; when a field
        /// matches several nodes, their values are joined with <c>DataSeparator</c>.
        /// </para>
        /// <para>
        /// An exception raised while processing a page does not abort the run: it is counted
        /// in <c>UrlErrors</c>, reported through <see cref="UpdateProgress"/>, paging for the
        /// current URL stops, and the scan continues with the next placeholder URL.
        /// </para>
        /// <para>
        /// A subscriber can stop the scan by setting <c>Cancel</c> on the event arguments it
        /// receives; the method then reports "Scan cancelled" and returns.
        /// </para>
        /// </remarks>
        /// <param name="csvFile">Name of the CSV file the extracted rows are written to.</param>
        /// <exception cref="ArgumentNullException"><paramref name="csvFile"/> is <c>null</c>.</exception>
        /// <exception cref="InvalidOperationException">
        /// <c>Url</c> is null, empty or white space; <c>ItemSelector</c> does not parse to any
        /// selector; or <c>Url</c> contains the page placeholder but <c>NextPageSelector</c>
        /// does not parse to any selector.
        /// </exception>
        public async Task RunAsync(string csvFile)
        {
            int    current = 0;
            int    total;
            bool   hasMorePages  = false;
            string dataSeparator = DataSeparator ?? string.Empty;

            UrlsScanned = 0;
            UrlErrors   = 0;

            // Verify an output file was specified
            if (csvFile == null)
            {
                throw new ArgumentNullException(nameof(csvFile));
            }

            // Check for obvious errors. These are configuration (object state) problems,
            // so InvalidOperationException is thrown rather than the bare Exception type.
            if (string.IsNullOrWhiteSpace(Url))
            {
                throw new InvalidOperationException("A valid URL is required.");
            }

            // Parse selectors
            SelectorCollection containerSelectors = Selector.ParseSelector(ContainerSelector);
            SelectorCollection itemSelectors      = Selector.ParseSelector(ItemSelector);
            SelectorCollection nextPageSelectors  = Selector.ParseSelector(NextPageSelector);

            foreach (Field field in Fields)
            {
                field.Selectors = Selector.ParseSelector(field.Selector);
            }

            // Validate selectors (ContainerSelector is optional)
            if (!itemSelectors.Any())
            {
                throw new InvalidOperationException($"A valid {nameof(ItemSelector)} is required.");
            }
            if (Placeholder.UrlContainsPlaceholder(Url, PageIterator.PagePlaceholder) && !nextPageSelectors.Any())
            {
                throw new InvalidOperationException($"A valid {nameof(NextPageSelector)} is required when a {{page}} placeholder is used.");
            }

            // Initialize placeholder iterator
            PlaceholderIterator placeholderIterator = new PlaceholderIterator(Url);

            foreach (var placeholder in Placeholders)
            {
                placeholderIterator.Add(new PlaceholderIteratorItem(placeholder));
            }
            total = placeholderIterator.GetTotalUrlCount();

            // Initialize page iterator
            PageIterator pageIterator = new PageIterator(nextPageSelectors);

            // Initialize UpdateProgress event arguments (one instance is reused for every notification)
            UpdateProgressEventArgs eventArgs = new UpdateProgressEventArgs();

            // Returns true when a subscriber requested cancellation; in that case the
            // "Scan cancelled" status has already been reported and the caller must return.
            bool HandleCancelRequest()
            {
                if (!eventArgs.Cancel)
                {
                    return false;
                }

                eventArgs.Status = "Scan cancelled";
                OnUpdateProgress(eventArgs);
                return true;
            }

            using (CsvWriter writer = new CsvWriter(csvFile))
            {
                // Write column headers
                if (WriteColumnHeaders)
                {
                    writer.WriteRow(Fields.Select(f => f.Name));
                }

                // Scan URLs: the outer loop walks the placeholder URLs, the inner loop
                // follows "next page" links for the current URL.
                placeholderIterator.Reset(out string? url);
                do
                {
                    pageIterator.Reset(url);
                    do
                    {
                        try
                        {
                            // Get next URL
                            url = pageIterator.GetCurrentPageUrl();
                            eventArgs.Status  = $"Scanning '{url}'";
                            eventArgs.Percent = UpdateProgressEventArgs.CalculatePercent(current, total);
                            OnUpdateProgress(eventArgs);

                            // Handle cancel request
                            if (HandleCancelRequest())
                            {
                                return;
                            }

                            // Download and parse next web page
                            string html = await DownloadUrlAsync(url);

                            HtmlDocument document = HtmlDocument.FromHtml(html);

                            // Search for containers; without container selectors every
                            // root element of the document is searched.
                            IEnumerable<HtmlElementNode> containers = containerSelectors.Any() ?
                                                                      containerSelectors.Find(document.RootNodes) :
                                                                      document.RootNodes.OfType<HtmlElementNode>();
                            IEnumerable<HtmlElementNode> nodes = itemSelectors.Find(containers);
                            hasMorePages = pageIterator.CheckIfMorePages(document);

                            // Search for fields in each item container
                            foreach (HtmlElementNode node in nodes)
                            {
                                foreach (Field field in Fields)
                                {
                                    IEnumerable<HtmlElementNode> matchingNodes = field.FindValue(node);
                                    field.Value = string.Join(dataSeparator, matchingNodes.Select(n => field.GetValueFromNode(n)));
                                }
                                writer.WriteRow(Fields.Select(f => f.Value));
                            }

                            UrlsScanned++;
                        }
                        catch (Exception ex)
                        {
                            // Best effort: record the failure, stop paging this URL and
                            // move on to the next placeholder URL.
                            UrlErrors++;
                            hasMorePages     = false;
                            eventArgs.Status = $"ERROR : '{url}' : {ex.Message}";
                            OnUpdateProgress(eventArgs);

                            // Handle cancel request
                            if (HandleCancelRequest())
                            {
                                return;
                            }
                        }
                    } while (hasMorePages);
                    current++;
                } while (placeholderIterator.Next(out url));

                eventArgs.Status  = "Scan complete";
                eventArgs.Percent = 100;
                OnUpdateProgress(eventArgs);
            }
        }