/// <summary>
/// command to parse single file or wildcard
/// </summary>
/// <param name="msg">
/// full details of single (e.g. C:\a.html) or wildcard (e.g. C:\*.html) filespec
/// </param>
/// <returns>
/// always true, to tell the ActorSystem that we accepted this command (so it is not DeadLettered)
/// </returns>
/// <remarks>
/// command originates from
///     1. .NET caller (with callback to get progress notifications)
///     2. this ParseCoordinatorActor.ParseFile
/// Routing:
///     - file missing           : reply to Sender with a ParsedHtmlMessage carrying a FileNotFoundException
///     - fewer than MAXWORKERS  : create a new ParseActor child and give it the request
///     - otherwise              : give the request to the least-loaded worker, or park it on ToDo
///                                when even that worker already has MAXBUSY requests outstanding
/// NOTE(review): File.Exists returns false for a wildcard filespec, so a wildcard that reaches this
/// method is reported as "not found" - confirm that wildcards are expanded before this point.
/// </remarks>
bool BeginParse(ParseHtmlMessage msg)
{
    var filespec = msg.Filespec;
    _Log.Info($"ParseCoordinatorActor.BeginParse({filespec}) starting");

    if (!File.Exists(filespec))
    {
        Sender.Tell(new ParsedHtmlMessage(
            filespec,
            new List<DownloadMessage>(),
            msg.Url,
            new FileNotFoundException("BeginParse cannot find file", filespec)));
        return true;                // show ActorSystem that we tried (don't DeadLetter)
    }

    Worker myWorker = null;
    if (Workers.Count < MAXWORKERS)
    {
        // still room in the pool: spin up a fresh child with a unique name (root + running number)
        var newName = ActorNames.PARSEWORKERROOT + (++ActorNumber);
        var parser = Context.ActorOf<ParseActor>(newName);     // parameter-less default constructor
        myWorker = new Worker(parser);
        Workers.Add(newName, myWorker);
    }
    else
    {
        // pool is full: choose the worker with the FEWEST outstanding requests.
        // (Seeding the search with the first worker also covers the all-idle case, where every
        // ActiveCount is 0 and a "greater than zero" comparison would never select anybody.)
        foreach (var wkr in Workers)
        {
            var candidate = wkr.Value;
            if (myWorker == null || candidate.ActiveCount < myWorker.ActiveCount)
            {
                myWorker = candidate;
            }
        }

        // if even the least-loaded worker is saturated (or there are no workers at all), defer
        if (myWorker == null || myWorker.ActiveCount >= MAXBUSY)
        {
            ToDo.Enqueue(msg);
            return true;            // handled (will dequeue later)
        }
    }

    TellParser(msg, myWorker);
    return true;                    // handled (passed to child actor)
}
/// <summary>
/// Load one local HTML file, collect the targets of its &lt;a href&gt;, &lt;link href&gt; and &lt;img src&gt;
/// elements, and reply to the Sender with a ParsedHtmlMessage listing them.
/// </summary>
/// <param name="msg">
/// parse request: Filespec is the local file to load; Url (may be null/blank) is used as the base
/// for resolving relative references unless the page declares its own &lt;base href&gt;
/// </param>
/// <returns>
/// always true, to show the ActorSystem that the message was handled
/// </returns>
/// <remarks>
/// Side effects visible here: sets FolderForHtml, FolderNonHtml and BaseUri, and sends exactly one
/// ParsedHtmlMessage to Sender (with the exception attached when the file cannot be loaded).
/// A failure while extracting one category (anchors / links / images) is logged and that category
/// is treated as empty; the other categories are still reported.
/// </remarks>
bool DoParse(ParseHtmlMessage msg)
{
    // null-safe trim: an empty filespec makes doc.Load throw below, which is reported back to Sender
    var filespec = (msg.Filespec ?? string.Empty).Trim();
    _Log.Info("ParseActor({0}).ParseHtmlMessage({1})) starting", Self.Path, filespec);

    #region HAP docs
    /*
     *  // From File
     *  var doc = new HtmlDocument();
     *  doc.Load(filePath);
     *
     *  // From String
     *  var doc = new HtmlDocument();
     *  doc.LoadHtml(html);
     *
     *  // From Web
     *  var url = "http://html-agility-pack.net/";
     *  var web = new HtmlWeb();
     *  var doc = web.Load(url);
     */
    #endregion

    var doc = new HtmlDocument();
    try
    {
        doc.Load(filespec);         // non-async but small-beer [local file] compared to CPU-bound parsing
    }
    catch (Exception excp)
    {
        _Log.Error("ParseActor({0}).ParseHtmlMessage({1})) exception({2}))", Self.Path, filespec, excp);
        Sender.Tell(new ParsedHtmlMessage(filespec, null, exception: excp));    // probably wasn't an HTML file
        return true;                // show ActorSystem we handled message [expect next one immediately!]
    }

    var fi = new FileInfo(filespec);
    FolderForHtml = fi.DirectoryName + Backslash;           // download *.html files into same folder [simplify a.html->b.html->a.html nav]
    FolderNonHtml = FolderForHtml + SUBFOLDER + Backslash;  // put files for all other extensions into subfolder [created by first DownloadMessage]

    // If the page Url is null, blank, relative or malformed we leave BaseUri null, which means any
    // relative Url's that we discover are ignored. TryCreate is used because the Uri(string)
    // constructor throws for anything that is not a well-formed absolute Uri.
    BaseUri = null;
    if (!string.IsNullOrWhiteSpace(msg.Url))
    {
        Uri pageUri;
        if (Uri.TryCreate(msg.Url.Trim().ToLowerInvariant(), UriKind.Absolute, out pageUri))
        {
            BaseUri = pageUri;
        }
    }

    // HTML5: <base> specifies the base URL for all relative URLs in the page [max=1].
    // "@href" selects on the ATTRIBUTE (a bare "href" would test for a child element), and the
    // leading "//" is needed because <head> sits under <html>, not directly under the document node.
    // A <base> with no href (could be solely TARGET) is not matched.
    var defaultbase = doc.DocumentNode.SelectSingleNode("//head/base[@href]");
    if (defaultbase != null)
    {
        var baseurl = defaultbase.Attributes["href"].Value;
        if (!string.IsNullOrWhiteSpace(baseurl))
        {
            var declared = baseurl.Trim().ToLowerInvariant();
            Uri declaredBase = null;
            var resolved = Uri.TryCreate(declared, UriKind.Absolute, out declaredBase);
            if (!resolved && BaseUri != null)
            {
                // relative <base href>, e.g. "/docs/": resolve it against the page's own address
                resolved = Uri.TryCreate(BaseUri, declared, out declaredBase);
            }
            if (resolved)
            {
                BaseUri = declaredBase;
            }
        }
    }

    // Each query below is materialised with ToList() INSIDE its try block. LINQ is lazy, so without
    // that the lambdas would only run later (outside the try) and the catch could never fire.
    List<DownloadMessage> anchors;
    try
    {
        anchors = (doc.DocumentNode.SelectNodes("//a[@href]") ?? new HtmlNodeCollection(null))     // HAP returns null if nothing found
            .Select(nod => CombineUriToString(nod.Attributes["href"].Value, nod.Attributes["download"]?.Value, DefaultExtn_A(nod)))    // file.asp or file.aspx -> file.html
            .ToList();
    }
    catch (Exception excp1)
    {
        _Log.Error("failed during Anchors extract ({0})", excp1.Message);
        anchors = new List<DownloadMessage>();
    }

    /*
     *  EXCLUSIONS
     *      action      <form>
     *      cite        <blockquote>, <del>, <ins>, <q>
     *      formaction  <button>, <input>
     *      href        <a>, <area>, <base>, <link>
     *      media       <a>, <area>, <link>, <source>, <style>
     *      muted       <video>, <audio>
     *      src         <audio>, <embed>, <iframe>, <img>, <input>, <script>, <source>, <track>, <video>
     *      srcset      <img>, <source>
     *      target      <a>, <area>, <base>, <form>
     *      type        <button>, <embed>, <input>, <link>, <menu>, <object>, <script>, <source>, <style>
     */
    List<DownloadMessage> links;
    try
    {
        links = (doc.DocumentNode.SelectNodes("//link[@href]") ?? new HtmlNodeCollection(null))
            // ignore dns-prefetch hints in the <head/>; "rel" is optional so it must be read null-safely
            .Where(n => !string.Equals(n.Attributes["rel"]?.Value?.Trim(), "dns-prefetch", StringComparison.OrdinalIgnoreCase))
            .Select(nod => CombineUriToString(nod.Attributes["href"].Value, nod.Attributes["download"]?.Value, DefaultExtn_Link(nod)))
            .ToList();
    }
    catch (Exception excp2)
    {
        _Log.Error("failed during Links extract ({0})", excp2.Message);
        links = new List<DownloadMessage>();
    }

    List<DownloadMessage> images;
    try
    {
        images = (doc.DocumentNode.SelectNodes("//img[@src]") ?? new HtmlNodeCollection(null))
            // the XPath guarantees "src" exists; <img> carries its target in src, not href
            .Select(nod => CombineUriToString(nod.Attributes["src"].Value, nod.Attributes["download"]?.Value, DefaultExtn_Img(nod)))
            .ToList();
    }
    catch (Exception excp3)
    {
        _Log.Error("failed during images extract ({0})", excp3.Message);
        images = new List<DownloadMessage>();
    }

#if DEBUG
    foreach (var anchor in anchors)
    {
        Console.WriteLine($"Anchor {anchor?.Url} => {anchor?.TargetPath}");
    }
    foreach (var link in links)
    {
        Console.WriteLine($"Link {link?.Url} => {link?.TargetPath}");
    }
    foreach (var image in images)
    {
        Console.WriteLine($"Img {image?.Url} => {image?.TargetPath}");
    }
#endif

    // Union removes duplicates using DownloadMessage's default equality and keeps first-seen order
    // (it does NOT sort). Null entries are dropped before replying.
    var dlmsgs = anchors
        .Union(links)
        .Union(images)
        .Where(url => url != null)
        .ToList();

    Sender.Tell(new ParsedHtmlMessage(filespec, dlmsgs, msg.Url));
    return true;                    // show ActorSystem we handled message [expect next one immediately!]
}
/// <summary>
/// Wrap the two arguments in a ParseHtmlMessage and post it to this actor's own mailbox (Self).
/// </summary>
/// <param name="fileName">passed as the first ParseHtmlMessage constructor argument</param>
/// <param name="url">passed as the second ParseHtmlMessage constructor argument</param>
void ParseOne(string fileName, string url) => Self.Tell(new ParseHtmlMessage(fileName, url));
/// <summary>
/// Forward a parse request to one particular worker, counting it against that worker first.
/// </summary>
/// <param name="msg">
/// the parse command to forward
/// </param>
/// <param name="worker">
/// the Worker whose ActiveCount is incremented and whose ActRef receives the command
/// </param>
static void TellParser(ParseHtmlMessage msg, Worker worker)
{
    // bump the outstanding-request counter before the message leaves
    worker.ActiveCount += 1;

    var target = worker.ActRef;
    target.Tell(msg);
}