private void VisitPage()
{
    // Flush the blocks collected for the previous page, then start a fresh one.
    ExtractBlocks();
    blocks.Clear();
    activePage = new RawPage();
    pages.Add(activePage);
}
public Task<ParsingResult> Parse(ParsingRequest request)
{
    if (request == null)
    {
        throw new ArgumentNullException(nameof(request));
    }

    logger.LogDebug("Parsing [{0}]", request.File.FullName);

    var document = new RawDocument();

    using (var documentProcessor = new PdfDocumentProcessor())
    {
        documentProcessor.LoadDocument(request.File.FullName);

        // Never read more pages than the document actually contains.
        var pages = request.MaxPages > documentProcessor.Document.Pages.Count
            ? documentProcessor.Document.Pages.Count
            : request.MaxPages;

        document.Pages = new RawPage[pages];

        for (var i = 1; i <= pages; i++)
        {
            var page = new RawPage();
            var data = GetImage(request, documentProcessor, i);
            page.Blocks = ocrImageParser.Parse(data).ToArray();
            document.Pages[i - 1] = page;
        }
    }

    return Task.FromResult(new ParsingResult(document, request, ParsingType.OCR));
}
public void Run(Site site)
{
    if (sites.Contains(site) == false)
    {
        site.Log("Site is not registered in Scraper List!");
        return;
    }

    Queue<RawPage> rawPages = new Queue<RawPage>();     // Holds downloaded pages until they are processed
    List<NodeResult> results = new List<NodeResult>();

    site.Status = SiteStatus.Downloading;               // Set site status
    site.SiteStart = DateTime.Now;

    foreach (PageLayout page in site.Pages.Values)
    {
        site.Log("Downloading " + site.URL + "...", LogType.Downloader);

        // Download each page and store it.
        DownloadResult result = downloadManager.Next(new Uri(page.URL + page.Path), page.SearchElement, page.JSExecution, page.XPathFilter, page.PageDelay);

        if (result.Status.HasFlag(DownloadStatus.ErrorOccurred))   // If any errors occurred, let the user know and log it
        {
            site.Log("Error occurred in " + site.URL, LogType.Downloader);
        }

        if (result.Status.HasFlag(DownloadStatus.Failed))
        {
            site.Log("Failed to download " + site.URL + ", skipped.", LogType.Downloader);
            continue;
        }

        result.Results.ForEach((rawPage) =>
        {
            PageDownloaded.Invoke(rawPage, EventArgs.Empty);        // Raise the event for each page downloaded
            rawPages.Enqueue(rawPage);
        });

        site.Log("Downloaded " + site.URL + "!", LogType.Downloader);
    }

    //Console.WriteLine("|" + string.Concat(Enumerable.Repeat("-", Console.BufferWidth - 1)));
    site.Status = SiteStatus.Processing;

    // Loop back over the downloaded pages and process them.
    while (rawPages.Count > 0)
    {
        RawPage rawPage = rawPages.Dequeue();
        results = pageProcessor.Next(rawPage, site, downloadManager);
        PageProcessed.Invoke(results, EventArgs.Empty);

        // Take the results from the page processor and pass them to the pipeline for packaging.
        outputPipeline.Output(results, site, rawPage.URL.LocalPath);
    }

    site.Status = SiteStatus.Finished;
    site.SiteFinished = DateTime.Now;                   // Record when the site finished so the total running time can be derived
}
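// A minimal caller sketch for the Run method above. The variable name "scraper" is hypothetical
// (the containing class is not shown in the snippet); it assumes PageDownloaded and PageProcessed
// are standard EventHandler events, matching the Invoke(sender, EventArgs.Empty) calls in Run.
scraper.PageDownloaded += (sender, _) =>
{
    // Run passes the downloaded RawPage as the event sender.
    var downloaded = (RawPage)sender;
    Console.WriteLine($"Downloaded {downloaded.URL}");
};
scraper.PageProcessed += (sender, _) =>
{
    // Run passes the list of NodeResult objects as the event sender.
    var processed = (List<NodeResult>)sender;
    Console.WriteLine($"Processed {processed.Count} node results");
};
scraper.Run(site);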
private void ExtractBlocks()
{
    if (activePage != null)
    {
        activePage.Blocks = blocks.Select(item => item.Construct()).Where(item => item != null).ToArray();
        blocks.Clear();
        activePage = null;
    }
}
/// <summary>
/// Parses a RawPage into an AllocationMapPage
/// </summary>
public AllocationMapPage Parse(RawPage page)
{
    var allocationPage = new AllocationMapPage
    {
        Data = page.Data,
        Header = page.Header,
        StartPage = new PageAddress(page.Header.PageAddress.FileId, 0)
    };

    ParseAllocationMap(allocationPage);

    return allocationPage;
}
public RawPage Read(string value)
{
    var rawPage = new RawPage();
    var header = new Header();

    new TextHeaderReader(value).LoadHeader(header);

    rawPage.Data = ReadData(value);
    rawPage.Header = header;

    return rawPage;
}
public new IndexAllocationMapPage Parse(RawPage page)
{
    var allocationPage = new IndexAllocationMapPage
    {
        Data = page.Data,
        Header = page.Header
    };

    ParseAllocationMap(allocationPage);
    ParseIamHeader(allocationPage);
    ParseSinglePageSlots(allocationPage);

    return allocationPage;
}
public PageFreeSpacePage Parse(RawPage page)
{
    if (page.Header.PageType != PageType.Pfs)
    {
        throw new InvalidOperationException($"Page is not a PFS page - {page.Header.PageType}");
    }

    var pfsPage = new PageFreeSpacePage
    {
        Header = page.Header,
        Data = page.Data
    };

    LoadPfsBytes(pfsPage);

    return pfsPage;
}
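// A minimal usage sketch tying the Read method above to the PFS parser. The class names
// PageTextReader and PfsPageParser are assumptions (the snippets only show method bodies,
// not the containing types), and pageText stands in for the raw page text being read.
var rawPage = new PageTextReader().Read(pageText);
var pfsPage = new PfsPageParser().Parse(rawPage);   // throws InvalidOperationException if the header is not a PFS page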
public Task<ParsingResult> Parse(ParsingRequest request)
{
    if (request == null)
    {
        throw new ArgumentNullException(nameof(request));
    }

    logger.LogDebug("Parsing [{0}]", request.File.FullName);

    var document = new RawDocument();
    bool containsText = false;

    using (var documentProcessor = new PdfDocumentProcessor())
    {
        documentProcessor.LoadDocument(request.File.FullName);

        var pages = request.MaxPages > documentProcessor.Document.Pages.Count
            ? documentProcessor.Document.Pages.Count
            : request.MaxPages;

        document.Pages = new RawPage[pages];

        for (var i = 1; i <= pages; i++)
        {
            var page = new RawPage { Blocks = new[] { new TextBlockItem() } };
            page.Blocks[0].Text = documentProcessor.GetPageText(i);

            if (!string.IsNullOrWhiteSpace(page.Blocks[0].Text))
            {
                containsText = true;
            }

            document.Pages[i - 1] = page;
        }
    }

    if (!containsText)
    {
        logger.LogInformation("Failed to find text in: [{0}]", request.File.FullName);
        return Task.FromResult(ParsingResult.ConstructError(request));
    }

    return Task.FromResult(new ParsingResult(document, request, ParsingType.Extract));
}
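// A minimal sketch (inside an async method) of how a caller might chain the two parsers above:
// try text extraction first and fall back to OCR when the extract parser reports no text.
// The field names extractParser and ocrParser, and detecting the error case via a null Document,
// are assumptions - the snippets only show that the extract parser returns
// ParsingResult.ConstructError(request) when no text is found.
var result = await extractParser.Parse(request);
if (result.Document == null)
{
    // Scanned PDF with no embedded text: run the OCR-based parser instead.
    result = await ocrParser.Parse(request);
}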
public List<NodeResult> Next(RawPage rawPage, Site site, Downloader downloader)
{
    HtmlDocument html = new HtmlDocument();
    html.LoadHtml(rawPage.Content);

    List<NodeRequest> layouts = site.Pages[rawPage.URL.PathAndQuery.Remove(0, 1)].Nodes;
    List<NodeResult> htmlNodes = new List<NodeResult>();

    //site.Log("Processing, " + layouts.Count + " Nodes");
    foreach (NodeRequest request in layouts)
    {
        var nodes = html.DocumentNode.SelectNodes(request.XPath);      // Search for the nodes using the predefined XPaths

        if (nodes != null)
        {
            // Create the node result object for storing.
            NodeResult result = new NodeResult();
            result.Property = request.Property;
            result.Nodes = nodes.ToList();
            result.Site = site;
            result.Page = rawPage.URL.AbsolutePath;

            if (request.Attribute != null)
            {
                result.Attribute = request.Attribute;
            }

            if (request.Recursive && request.Attribute == "href")      // Handles recursive requests, allowing the page processor to download a linked page and reprocess it
            {
                var node = result.Nodes.FirstOrDefault();

                if (node?.Attributes[request.Attribute] != null)
                {
                    var link = node.Attributes[request.Attribute].Value;
                    site.Log("Recursive request downloading page..", LogType.Downloader);

                    if (link.Substring(0, 4) != "http")                 // Attach the host address if it's not already part of the href attribute
                    {
                        if (link.Substring(0, 5) == " ")                 // Specific workaround for bookings.com
                        {
                            link = link.Substring(6);
                        }

                        link = "http://" + site.URL.Host + "/" + link;
                    }

                    var recursivePage = downloader.DownloadPage(new Uri(link));
                    var recursiveDoc = new HtmlDocument();
                    recursiveDoc.LoadHtml(recursivePage.Content);

                    var recursiveNodes = recursiveDoc.DocumentNode.SelectNodes(request.RecursiveXPath);

                    if (recursiveNodes != null)                          // Only add a result when the recursive XPath actually found something
                    {
                        htmlNodes.Add(new NodeResult
                        {
                            Property = request.Property,
                            Attribute = request.Attribute,
                            Nodes = recursiveNodes.ToList(),
                            Site = site,
                            Page = rawPage.URL.AbsolutePath,
                        });
                    }
                }
            }
            else
            {
                htmlNodes.Add(result);
            }
        }
    }

    site.Log("Finished Processing!");
    return htmlNodes;
}
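// A hedged alternative sketch for the manual "http://" + host concatenation above: the .NET Uri
// class can resolve a relative href against the page it came from, which also preserves the
// original scheme. link and rawPage refer to the same values used inside the Next method.
var absolute = Uri.TryCreate(link, UriKind.Absolute, out var direct)
    ? direct                                   // href already contained a full URL
    : new Uri(rawPage.URL, link);              // resolve the relative href against the current page
var recursivePage = downloader.DownloadPage(absolute);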
public Page Parse(RawPage page)
{
    // Placeholder implementation: no parsing is performed and callers receive null.
    return null;
}