/// <summary>
        ///     Accepts a request to parse one HTML file: replies with a failure message if the file
        ///     is missing, otherwise hands the request to a parse worker or queues it for later.
        /// </summary>
        /// <param name="msg">
        ///     parse request; <c>msg.Filespec</c> names the file and <c>msg.Url</c> is echoed back in the failure reply.
        ///     NOTE(review): earlier documentation advertised wildcard filespecs (e.g. C:\*.html), but the
        ///     File.Exists check below tests for a single existing file, so a wildcard would be reported as
        ///     "not found" unless it is expanded before reaching this method - confirm against the callers.
        /// </param>
        /// <returns>
        ///     always true, to tell the ActorSystem that we accepted this command
        /// </returns>
        /// <remarks>
        ///     command originates from
        ///     1. .NET caller (with callback to get progress notifications)
        ///     2. this ParseCoordinatorActor.ParseFile
        ///
        ///     Dispatch policy:
        ///     - while fewer than MAXWORKERS workers exist, a new ParseActor child is created and used;
        ///     - otherwise the worker with the lowest ActiveCount is chosen;
        ///     - if even that worker already has MAXBUSY (or more) active requests, the message is put on ToDo.
        /// </remarks>
        bool BeginParse(ParseHtmlMessage msg)
        {
            var filespec = msg.Filespec;

            _Log.Info($"ParseCoordinatorActor.BeginParse({filespec}) starting");
            if (!File.Exists(filespec))
            {
                Sender.Tell(new ParsedHtmlMessage(filespec, new List<DownloadMessage>(), msg.Url, new FileNotFoundException("BeginParse cannot find file", filespec)));
                return(true);            // show ActorSystem that we tried (don't DeadLetter)
            }

            Worker myWorker = null;

            if (Workers.Count < MAXWORKERS)
            {
                // pool not full yet: create a fresh child parser under a unique name (root name + running number)
                var newName = ActorNames.PARSEWORKERROOT + (++ActorNumber);
                var parser  = Context.ActorOf<ParseActor>(newName);             // parameter-less default constructor
                myWorker = new Worker(parser);
                Workers.Add(newName, myWorker);
            }
            else
            {
                // Pool is full: pick the LEAST loaded worker. The previous loop started from zero with a strict
                // "<" test, which selected the busiest worker and left myWorker null whenever every worker was idle.
                foreach (var wkr in Workers)
                {
                    var candidate = wkr.Value;
                    if (myWorker == null || candidate.ActiveCount < myWorker.ActiveCount)
                    {
                        myWorker = candidate;
                    }
                }
                // no worker at all, or even the least loaded one is saturated: park the request
                if (myWorker == null || myWorker.ActiveCount >= MAXBUSY)
                {
                    ToDo.Enqueue(msg);
                    return(true);            // handled (will dequeue later)
                }
            }
            TellParser(msg, myWorker);
            return(true);            // handled (passed to child actor)
        }
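        // Example no. 2 (0) - separator text that appears to be left over from the listing these snippets were copied from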
        /// <summary>
        ///     Loads one local HTML file, collects a download request for every &lt;a href&gt;, &lt;link href&gt;
        ///     and &lt;img src&gt; element in it, and replies to the sender with a ParsedHtmlMessage.
        /// </summary>
        /// <param name="msg">
        ///     parse request; <c>msg.Filespec</c> is the file to load, <c>msg.Url</c> (optional) is the absolute URL
        ///     the file came from and is used as the base for relative links
        /// </param>
        /// <returns>
        ///     always true, to show the ActorSystem that the message was handled
        /// </returns>
        /// <remarks>
        ///     Side effects: sets FolderForHtml, FolderNonHtml and BaseUri on this actor.
        ///     A file that cannot be loaded produces a ParsedHtmlMessage carrying the exception and an empty list.
        ///     A failure while extracting one kind of element is logged and that kind contributes no downloads.
        /// </remarks>
        bool DoParse(ParseHtmlMessage msg)
        {
            // null-safe Trim: a null filespec now reaches doc.Load, whose failure is reported to the sender below
            var filespec = msg.Filespec?.Trim();

            _Log.Info("ParseActor({0}).ParseHtmlMessage({1})) starting", Self.Path, filespec);

            #region HAP docs

            /*
             * // From File
             * var doc = new HtmlDocument();
             * doc.Load(filePath);
             *
             * // From String
             * var doc = new HtmlDocument();
             * doc.LoadHtml(html);
             *
             * // From Web
             * var url = "http://html-agility-pack.net/";
             * var web = new HtmlWeb();
             * var doc = web.Load(url);
             */
            #endregion

            var doc = new HtmlDocument();
            try
            {
                doc.Load(filespec);                                 // non-async but small-beer [local file] compared to CPU-bound parsing
            }
            catch (Exception excp)
            {
                _Log.Error("ParseActor({0}).ParseHtmlMessage({1})) exception({2}))", Self.Path, filespec, excp);
                // probably wasn't an HTML file; reply with an empty (not null) list, as the coordinator does for a missing file
                Sender.Tell(new ParsedHtmlMessage(filespec, new List<DownloadMessage>(), msg.Url, excp));
                return(true);                                       // show ActorSystem we handled message [expect next one immediately!]
            }

            var fi = new FileInfo(filespec);
            FolderForHtml = fi.DirectoryName + Backslash;           // download *.html files into same folder [simplify a.html->b.html->a.html nav]
            FolderNonHtml = FolderForHtml + SUBFOLDER + Backslash;  // put files for all other extensions into subfolder [created by first DownloadMessage]

            // If the URL is null, blank, relative or malformed we leave BaseUri null and ignore relative links.
            // TryCreate is used because new Uri(string) throws UriFormatException for anything that is not absolute.
            // NOTE(review): lower-casing the whole URL is kept from the original, but it can break
            // case-sensitive paths and query strings - confirm that it is intended.
            BaseUri = null;
            Uri requestBase;
            if (!string.IsNullOrWhiteSpace(msg.Url) &&
                Uri.TryCreate(msg.Url.Trim().ToLowerInvariant(), UriKind.Absolute, out requestBase))
            {
                BaseUri = requestBase;
            }

            // HTML5 <base href> specifies the base URL for all relative URLs in the page [max=1].
            // "@href" tests the attribute (a bare "href" would look for a child element), and "//" finds the
            // element wherever the parser placed it instead of only directly under the document node.
            var defaultbase = doc.DocumentNode.SelectSingleNode("//base[@href]");     // any HREF ? (could be solely TARGET)
            if (defaultbase != null)
            {
                var baseurl = defaultbase.Attributes["href"].Value;
                if (!string.IsNullOrWhiteSpace(baseurl))
                {
                    var candidate = baseurl.Trim().ToLowerInvariant();
                    Uri documentBase;
                    // accept an absolute href as-is; resolve a relative one against the request URL when we have it
                    if (Uri.TryCreate(candidate, UriKind.Absolute, out documentBase) ||
                        (BaseUri != null && Uri.TryCreate(BaseUri, candidate, out documentBase)))
                    {
                        BaseUri = documentBase;
                    }
                }
            }

            // Each query below is materialised with ToList() INSIDE its try block: LINQ is lazy, so without it
            // the lambdas would only run (and throw) later, outside the handler that is meant to catch them.
            IEnumerable<DownloadMessage> anchors;
            try
            {
                anchors = (doc.DocumentNode.SelectNodes("//a[@href]") ?? new HtmlNodeCollection(null))                                             // HAP returns null if nothing found
                          .Select(nod => CombineUriToString(nod.Attributes["href"].Value, nod.Attributes["download"]?.Value, DefaultExtn_A(nod)))  // file.asp or file.aspx -> file.html
                          .ToList();
            }
            catch (Exception excp1)
            {
                _Log.Error("failed during Anchors extract ({0})", excp1.Message);
                anchors = new List<DownloadMessage>();
            }

            /*
             * EXCLUSIONS
             *  action      <form>
             *  cite        <blockquote>, <del>, <ins>, <q>
             *  formaction  <button>, <input>
             *  href        <a>, <area>, <base>, <link>
             *  media       <a>, <area>, <link>, <source>, <style>
             *  muted       <video>, <audio>
             *  src         <audio>, <embed>, <iframe>, <img>, <input>, <script>, <source>, <track>, <video>
             *  srcset      <img>, <source>
             *  target      <a>, <area>, <base>, <form>
             *  type        <button>, <embed>, <input>, <link>, <menu>, <object>, <script>, <source>, <style>
             */
            IEnumerable<DownloadMessage> links;
            try
            {
                // Skip <link rel="dns-prefetch"> hints [stuff in the <head/> that is not a file to download].
                // "rel" is read null-safely because a <link> without it would otherwise throw; the former
                // name != "form" test was dropped because this XPath can only ever return <link> elements.
                links = (doc.DocumentNode.SelectNodes("//link[@href]") ?? new HtmlNodeCollection(null))
                        .Where(n => !string.Equals(n.Attributes["rel"]?.Value?.Trim(), "dns-prefetch", StringComparison.OrdinalIgnoreCase))
                        .Select(nod => CombineUriToString(nod.Attributes["href"].Value, nod.Attributes["download"]?.Value, DefaultExtn_Link(nod)))
                        .ToList();
            }
            catch (Exception excp2)
            {
                _Log.Error("failed during Links extract ({0})", excp2.Message);
                links = new List<DownloadMessage>();
            }

            IEnumerable<DownloadMessage> images;
            try
            {
                // <img> carries its address in "src" (the XPath guarantees it is present); it has no "href"
                images = (doc.DocumentNode.SelectNodes("//img[@src]") ?? new HtmlNodeCollection(null))
                         .Select(nod => CombineUriToString(nod.Attributes["src"].Value, nod.Attributes["download"]?.Value, DefaultExtn_Img(nod)))
                         .ToList();
            }
            catch (Exception excp3)
            {
                _Log.Error("failed during images extract ({0})", excp3.Message);
                images = new List<DownloadMessage>();
            }
#if DEBUG
            foreach (var anchor in anchors)
            {
                Console.WriteLine($"Anchor {anchor?.Url} => {anchor?.TargetPath}");
            }
            foreach (var link in links)
            {
                Console.WriteLine($"Link  {link?.Url} => {link?.TargetPath}");
            }
            foreach (var image in images)
            {
                Console.WriteLine($"Img  {image?.Url} => {image?.TargetPath}");
            }
#endif
            // Union removes duplicates (using DownloadMessage's equality); the result is NOT sorted.
            // Null entries are dropped afterwards.
            var dlmsgs = anchors
                         .Union(links)
                         .Union(images)
                         .Where(dl => dl != null)
                         .ToList();

            Sender.Tell(new ParsedHtmlMessage(filespec, dlmsgs, msg.Url));

            return(true);                                        // show ActorSystem we handled message [expect next one immediately!]
        }
        /// <summary>
        ///     Builds a ParseHtmlMessage from the two arguments and posts it to this actor (Self).
        /// </summary>
        /// <param name="fileName">
        ///     passed as the first constructor argument of the ParseHtmlMessage
        /// </param>
        /// <param name="url">
        ///     passed as the second constructor argument of the ParseHtmlMessage
        /// </param>
        void ParseOne(string fileName, string url) =>
            Self.Tell(new ParseHtmlMessage(fileName, url));
 /// <summary>
 ///     Sends a parse command to one particular worker and counts it as one more active request on that worker.
 /// </summary>
 /// <param name="msg">
 ///     the parse command to forward
 /// </param>
 /// <param name="worker">
 ///     the Worker whose ActiveCount is incremented and whose ActRef receives the command
 /// </param>
 static void TellParser(ParseHtmlMessage msg, Worker worker)
 {
     // count the request first, then hand it over
     worker.ActiveCount += 1;

     var target = worker.ActRef;
     target.Tell(msg);
 }