예제 #1
0
        /// <summary>
        /// Populates a new <typeparamref name="T"/> from a single entry node by querying each
        /// configured property selector against it and converting the matched node.
        /// </summary>
        /// <param name="entryNode">Node that the property selectors are evaluated against.</param>
        /// <param name="instruction">Instruction whose entries configuration lists the properties to extract.</param>
        /// <param name="result">
        /// Always assigned a freshly constructed <typeparamref name="T"/>; a property is set on it
        /// for every configuration whose selector matched a node.
        /// </param>
        /// <returns><c>true</c> if at least one property selector matched a node; otherwise <c>false</c>.</returns>
        private bool TryParseNode <T>(HtmlNode entryNode, HtmlScrapeInstruction <T> instruction, out T result)
            where T : IScrapeResult, new()
        {
            result = new T();
            var anyPropertySet = false;

            // Only configurations that actually carry a selector string are considered.
            var selectableProperties = instruction.Configuration.EntriesConfiguration.PropertyConfigurations
                                       .Where(x => x.Selector?.Selector != null);

            foreach (var propertyConfig in selectableProperties)
            {
                var matchedNode = entryNode.QuerySingle(propertyConfig.Selector);
                if (matchedNode == null)
                {
                    // TODO: Log
                    continue;
                }

                // A per-property converter takes precedence over the configured default.
                var converter = propertyConfig.Converter ?? _configuration.DefaultNodeConverter;

                // Stays null when the converter is null or of neither supported kind.
                object converted = null;
                if (converter is INodeConverter nodeConverter)
                {
                    converted = nodeConverter.Convert(matchedNode);
                }
                else if (converter is IPropertyValueConverter propertyValueConverter)
                {
                    converted = propertyValueConverter.Convert(matchedNode.InnerText);
                }

                result.SetProperty(propertyConfig.PropertyName, converted);
                anyPropertySet = true;
            }

            return anyPropertySet;
        }
예제 #2
0
        /// <summary>
        /// Runs the instruction's entries selector against the document and returns the matching nodes.
        /// </summary>
        /// <param name="instruction">Instruction whose entries configuration supplies the selector.</param>
        /// <param name="doc">Parsed document to query.</param>
        /// <returns>The non-empty, already materialized collection of nodes matched by the selector.</returns>
        /// <exception cref="ScraperException">The query returned <c>null</c> or no nodes.</exception>
        private static IEnumerable <HtmlNode> GetEntryNodes <T>(HtmlScrapeInstruction <T> instruction, HtmlDocument doc)
            where T : IScrapeResult, new()
        {
            var entriesSelector = instruction.Configuration.EntriesConfiguration.Selector;

            // Materialize once: the query result may be a lazily evaluated sequence, and checking it
            // for emptiness before handing the same sequence to the caller would run the query twice.
            var entryNodes = doc.DocumentNode.QueryAll(entriesSelector)?.ToList();

            if (entryNodes == null || entryNodes.Count == 0)
            {
                // TODO: Log
                throw new ScraperException(
                          $"Query '{entriesSelector.Selector}' didn't return any results");
            }

            return entryNodes;
        }
예제 #3
0
        /// <summary>
        /// Executes a scrape instruction: repeatedly fetches content from a scraping session, parses
        /// the entry nodes of each document, and collects the parsed results keyed by their
        /// <c>Key</c>.
        /// </summary>
        /// <param name="instruction">Instruction describing what to fetch and how to parse it.</param>
        /// <returns>
        /// The distinct results collected across all fetched documents; when a key occurs more than
        /// once, the first result parsed with that key is kept.
        /// </returns>
        /// <remarks>
        /// Fetching stops when the session yields <c>null</c> content, when a document has no entry
        /// nodes, or when a document contributes no result with a previously unseen key.
        /// </remarks>
        public async Task <ICollection <T> > Scrape <T>(HtmlScrapeInstruction <T> instruction) where T : IScrapeResult, new()
        {
            var session      = instruction.StartScrapingSession(_configuration.HtmlFetcher);
            var resultsByKey = new Dictionary <string, T>();

            while (true)
            {
                var html = await session.GetNextContent();
                if (html == null)
                {
                    break;
                }

                var doc = new HtmlDocument();
                doc.LoadHtml(html);

                var entryNodes = GetEntryNodes(instruction, doc).ToList();
                if (entryNodes.Count == 0)
                {
                    break;
                }

                // Entries are only ever added under unseen keys, so an unchanged count after
                // processing this document means it produced nothing new.
                var countBeforeDocument = resultsByKey.Count;

                foreach (var entryNode in entryNodes)
                {
                    if (entryNode == null)
                    {
                        // TODO: Log
                        continue;
                    }

                    if (!TryParseNode(entryNode, instruction, out var result))
                    {
                        continue;
                    }

                    if (resultsByKey.ContainsKey(result.Key))
                    {
                        continue;
                    }

                    resultsByKey.Add(result.Key, result);
                }

                if (resultsByKey.Count == countBeforeDocument)
                {
                    break;
                }
            }

            return resultsByKey.Values;
        }