Пример #1
0
 /// <summary>
 /// Finishes the page that is currently being built (via <c>ExtractBlocks</c>),
 /// empties the block buffer and starts a new <see cref="RawPage"/> that is
 /// appended to <c>pages</c> and becomes the active page.
 /// </summary>
 private void VisitPage()
 {
     // Hand any pending blocks over to the page that is active right now.
     ExtractBlocks();

     // Always start the next page with an empty buffer.
     blocks.Clear();

     var freshPage = new RawPage();
     pages.Add(freshPage);
     activePage = freshPage;
 }
Пример #2
0
        /// <summary>
        /// Loads the PDF referenced by <paramref name="request"/> and, for each processed page,
        /// obtains image data through <c>GetImage</c> and feeds it to <c>ocrImageParser</c>.
        /// The number of processed pages is the smaller of <c>request.MaxPages</c> and the
        /// document's page count.
        /// </summary>
        /// <param name="request">Parsing request; its <c>File</c> is the PDF to load.</param>
        /// <returns>
        /// An already-completed task holding a <see cref="ParsingResult"/> of type
        /// <c>ParsingType.OCR</c>. All work is performed synchronously before the task is returned.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="request"/> is <c>null</c>.</exception>
        public Task<ParsingResult> Parse(ParsingRequest request)
        {
            if (request == null)
            {
                throw new ArgumentNullException(nameof(request));
            }

            logger.LogDebug("Parsing [{0}]", request.File.FullName);
            var document = new RawDocument();

            using (var documentProcessor = new PdfDocumentProcessor())
            {
                documentProcessor.LoadDocument(request.File.FullName);

                // Never process more pages than the document actually has.
                var pageCount = Math.Min(request.MaxPages, documentProcessor.Document.Pages.Count);
                document.Pages = new RawPage[pageCount];

                // The loop counts pages from 1 (the number handed to GetImage);
                // the result array is indexed from 0.
                for (var pageNumber = 1; pageNumber <= pageCount; pageNumber++)
                {
                    var data = GetImage(request, documentProcessor, pageNumber);
                    document.Pages[pageNumber - 1] = new RawPage
                    {
                        Blocks = ocrImageParser.Parse(data).ToArray()
                    };
                }
            }

            return Task.FromResult(new ParsingResult(document, request, ParsingType.OCR));
        }
Пример #3
0
        /// <summary>
        /// Runs a full scrape of <paramref name="site"/>: downloads every page layout in
        /// <c>site.Pages</c>, then processes each downloaded page and passes the results
        /// to the output pipeline. Site status and start/finish timestamps are updated along the way.
        /// </summary>
        /// <param name="site">
        /// The site to scrape. If it is not contained in <c>sites</c>, a message is logged
        /// on the site and the method returns without doing anything else.
        /// </param>
        public void Run(Site site)
        {
            if (!sites.Contains(site))
            {
                site.Log("Site is not registered in Scraper List!");
                return;
            }

            // Buffer between the download phase and the processing phase.
            Queue<RawPage> rawPages = new Queue<RawPage>();

            site.Status    = SiteStatus.Downloading;
            site.SiteStart = DateTime.Now;

            foreach (PageLayout page in site.Pages.Values)
            {
                site.Log("Downloading " + site.URL + "...", LogType.Downloader);

                DownloadResult result = downloadManager.Next(new Uri(page.URL + page.Path), page.SearchElement, page.JSExecution, page.XPathFilter, page.PageDelay);

                // An error is only reported; the result may still be usable.
                if (result.Status.HasFlag(DownloadStatus.ErrorOccurred))
                {
                    site.Log("Error occurred in " + site.URL, LogType.Downloader);
                }

                // A failed download yields nothing to process, so move on to the next layout.
                if (result.Status.HasFlag(DownloadStatus.Failed))
                {
                    site.Log("Failed to download " + site.URL + " skipped..", LogType.Downloader);
                    continue;
                }

                result.Results.ForEach(downloaded =>
                {
                    // Null-conditional invoke: the event is null when nobody has subscribed.
                    PageDownloaded?.Invoke(downloaded, EventArgs.Empty);
                    rawPages.Enqueue(downloaded);
                });

                site.Log("Downloaded " + site.URL + "!", LogType.Downloader);
            }

            site.Status = SiteStatus.Processing;
            while (rawPages.Count > 0)
            {
                RawPage rawPage = rawPages.Dequeue();

                List<NodeResult> results = pageProcessor.Next(rawPage, site, downloadManager);
                PageProcessed?.Invoke(results, EventArgs.Empty);

                // Hand the processed nodes to the pipeline, keyed by the page's local path.
                outputPipeline.Output(results, site, rawPage.URL.LocalPath);
            }

            site.Status       = SiteStatus.Finished;
            site.SiteFinished = DateTime.Now; // Together with SiteStart this gives the total running time
        }
Пример #4
0
 /// <summary>
 /// Constructs every buffered block, stores the non-null results on the active page,
 /// then clears the buffer and detaches the page. Does nothing when no page is active.
 /// </summary>
 private void ExtractBlocks()
 {
     if (activePage == null)
     {
         return;
     }

     var constructed = blocks
         .Select(pending => pending.Construct())
         .Where(built => built != null)
         .ToArray();

     activePage.Blocks = constructed;
     blocks.Clear();
     activePage = null;
 }
Пример #5
0
        /// <summary>
        /// Builds an <see cref="AllocationMapPage"/> from a <see cref="RawPage"/>.
        /// </summary>
        /// <param name="page">Raw page supplying the data and header.</param>
        /// <returns>
        /// A page carrying the raw data and header, whose <c>StartPage</c> is a
        /// <see cref="PageAddress"/> built from the header's file id and 0, after
        /// <c>ParseAllocationMap</c> has been run on it.
        /// </returns>
        public AllocationMapPage Parse(RawPage page)
        {
            var result = new AllocationMapPage();

            result.Data      = page.Data;
            result.Header    = page.Header;
            result.StartPage = new PageAddress(page.Header.PageAddress.FileId, 0);

            ParseAllocationMap(result);
            return result;
        }
Пример #6
0
        /// <summary>
        /// Builds a <see cref="RawPage"/> from <paramref name="value"/>: the header is filled in by a
        /// <see cref="TextHeaderReader"/> and the data comes from <c>ReadData</c>.
        /// </summary>
        /// <param name="value">Text passed to both the header reader and <c>ReadData</c>.</param>
        /// <returns>The populated raw page.</returns>
        public RawPage Read(string value)
        {
            var result     = new RawPage();
            var pageHeader = new Header();

            // The reader populates the header instance that is passed in.
            var headerReader = new TextHeaderReader(value);
            headerReader.LoadHeader(pageHeader);

            result.Data   = ReadData(value);
            result.Header = pageHeader;
            return result;
        }
        /// <summary>
        /// Builds an <see cref="IndexAllocationMapPage"/> from a <see cref="RawPage"/>. After copying
        /// data and header it runs, in order, <c>ParseAllocationMap</c>, <c>ParseIamHeader</c> and
        /// <c>ParseSinglePageSlots</c> on the new page.
        /// </summary>
        /// <param name="page">Raw page supplying the data and header.</param>
        /// <returns>The populated index allocation map page.</returns>
        public new IndexAllocationMapPage Parse(RawPage page)
        {
            var iamPage = new IndexAllocationMapPage();

            iamPage.Data   = page.Data;
            iamPage.Header = page.Header;

            // Order matters here only insofar as it mirrors the established sequence of parse steps.
            ParseAllocationMap(iamPage);
            ParseIamHeader(iamPage);
            ParseSinglePageSlots(iamPage);

            return iamPage;
        }
Пример #8
0
        /// <summary>
        /// Builds a <see cref="PageFreeSpacePage"/> from a <see cref="RawPage"/> and loads its
        /// PFS bytes via <c>LoadPfsBytes</c>.
        /// </summary>
        /// <param name="page">Raw page whose header must report <c>PageType.Pfs</c>.</param>
        /// <returns>The populated PFS page.</returns>
        /// <exception cref="InvalidOperationException">The header's page type is not <c>PageType.Pfs</c>.</exception>
        public PageFreeSpacePage Parse(RawPage page)
        {
            if (page.Header.PageType == PageType.Pfs)
            {
                var result = new PageFreeSpacePage();
                result.Header = page.Header;
                result.Data   = page.Data;

                LoadPfsBytes(result);
                return result;
            }

            // Any other page type cannot be interpreted as a PFS page.
            throw new InvalidOperationException($"Page is not a PFS pageFreeSpacePage - {page.Header.PageType}");
        }
        /// <summary>
        /// Loads the PDF referenced by <paramref name="request"/> and reads the text of each processed
        /// page through <c>GetPageText</c>, storing it in a single <see cref="TextBlockItem"/> per page.
        /// The number of processed pages is the smaller of <c>request.MaxPages</c> and the document's page count.
        /// </summary>
        /// <param name="request">Parsing request; its <c>File</c> is the PDF to load.</param>
        /// <returns>
        /// An already-completed task: a result of type <c>ParsingType.Extract</c> when at least one page
        /// has non-whitespace text, otherwise the value of <c>ParsingResult.ConstructError</c>.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="request"/> is <c>null</c>.</exception>
        public Task<ParsingResult> Parse(ParsingRequest request)
        {
            if (request == null)
            {
                throw new ArgumentNullException(nameof(request));
            }

            logger.LogDebug("Parsing [{0}]", request.File.FullName);
            var document     = new RawDocument();
            var containsText = false;

            using (var documentProcessor = new PdfDocumentProcessor())
            {
                documentProcessor.LoadDocument(request.File.FullName);

                var pageCount = Math.Min(request.MaxPages, documentProcessor.Document.Pages.Count);
                document.Pages = new RawPage[pageCount];

                for (var pageNumber = 1; pageNumber <= pageCount; pageNumber++)
                {
                    var textBlock = new TextBlockItem();
                    var rawPage   = new RawPage { Blocks = new[] { textBlock } };

                    rawPage.Blocks[0].Text = documentProcessor.GetPageText(pageNumber);

                    // One page with real text is enough to call the extraction successful.
                    containsText |= !string.IsNullOrWhiteSpace(rawPage.Blocks[0].Text);

                    document.Pages[pageNumber - 1] = rawPage;
                }
            }

            if (containsText)
            {
                return Task.FromResult(new ParsingResult(document, request, ParsingType.Extract));
            }

            logger.LogInformation("Failed to find text in: [{0}]", request.File.FullName);
            return Task.FromResult(ParsingResult.ConstructError(request));
        }
Пример #10
0
        /// <summary>
        /// Applies every node request configured for the page to the downloaded HTML and collects
        /// the matches. Requests flagged as recursive with an <c>href</c> attribute follow the first
        /// matched link, download that page and collect matches of the request's recursive XPath instead.
        /// </summary>
        /// <param name="rawPage">Downloaded page; its <c>Content</c> is parsed as HTML.</param>
        /// <param name="site">Site the page belongs to; supplies the page layouts and the host for relative links.</param>
        /// <param name="downloader">Downloader used to fetch pages for recursive requests.</param>
        /// <returns>
        /// One <see cref="NodeResult"/> per request that produced matches. Requests whose XPath matches
        /// nothing contribute no entry.
        /// </returns>
        public List<NodeResult> Next(RawPage rawPage, Site site, Downloader downloader)
        {
            HtmlDocument html = new HtmlDocument();
            html.LoadHtml(rawPage.Content);

            // Layouts are looked up by the URL's path-and-query with its first character removed.
            List<NodeRequest> layouts   = site.Pages[rawPage.URL.PathAndQuery.Remove(0, 1)].Nodes;
            List<NodeResult>  htmlNodes = new List<NodeResult>();

            foreach (NodeRequest request in layouts)
            {
                var nodes = html.DocumentNode.SelectNodes(request.XPath);
                if (nodes == null)
                {
                    continue; // The XPath matched nothing on this page.
                }

                NodeResult result = new NodeResult
                {
                    Property = request.Property,
                    Nodes    = nodes.ToList(),
                    Site     = site,
                    Page     = rawPage.URL.AbsolutePath
                };

                if (request.Attribute != null)
                {
                    result.Attribute = request.Attribute;
                }

                bool followLink = request.Recursive && request.Attribute == "href";
                if (!followLink)
                {
                    htmlNodes.Add(result);
                    continue;
                }

                // Recursive request: follow the link of the first matched node and re-process that page.
                var node = result.Nodes.FirstOrDefault();
                if (node?.Attributes[request.Attribute] == null)
                {
                    continue; // No link to follow, so this request yields no result.
                }

                var link = node.Attributes[request.Attribute].Value;
                site.Log("Recursive request downloading page..", LogType.Downloader);

                // StartsWith instead of Substring: short hrefs such as "/" or "#" must not throw.
                if (!link.StartsWith("http", StringComparison.Ordinal))
                {
                    if (link.StartsWith("&#10;", StringComparison.Ordinal)) // Specific workaround for bookings.com
                    {
                        // Drop the first six characters as before, but tolerate links shorter than that.
                        link = link.Length >= 6 ? link.Substring(6) : string.Empty;
                    }

                    // Attach the host address when it is not part of the href attribute.
                    link = "http://" + site.URL.Host + "/" + link;
                }

                var recursivePage = downloader.DownloadPage(new Uri(link));

                var recursiveDoc = new HtmlDocument();
                recursiveDoc.LoadHtml(recursivePage.Content);

                var recursiveNodes = recursiveDoc.DocumentNode.SelectNodes(request.RecursiveXPath);
                if (recursiveNodes != null) // Only record a result when the recursive XPath found something.
                {
                    htmlNodes.Add(new NodeResult
                    {
                        Property  = request.Property,
                        Attribute = request.Attribute,
                        Nodes     = recursiveNodes.ToList(),
                        Site      = site,
                        Page      = rawPage.URL.AbsolutePath,
                    });
                }
            }

            site.Log("Finished Processing!");
            return htmlNodes;
        }
Пример #11
0
 /// <summary>
 /// Placeholder parser: ignores <paramref name="page"/> and always returns <c>null</c>.
 /// </summary>
 /// <param name="page">Raw page (unused).</param>
 /// <returns>Always <c>null</c>.</returns>
 public Page Parse(RawPage page) => null;