/// <summary>
/// Parses the HTML of a section search response into sections keyed by CRN.
/// </summary>
/// <remarks>
/// Every non-header table row is a section and/or a meeting: a row whose second cell
/// holds a CRN starts a new section, and every row contributes one meeting to the
/// most recently started section.
/// </remarks>
/// <param name="content">Raw HTML of the response.</param>
/// <returns>
/// The parsed sections keyed by CRN; an empty dictionary when the page states that no
/// classes were found.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="content" /> is null.</exception>
/// <exception cref="ApplicationException">
/// The results table, an expected cell, or a meeting location could not be found, or a
/// meeting row appears before any section row.
/// </exception>
public Dictionary <string, MyPurdueSection> ParseHtml(string content)
        {
            if (content == null)
            {
                throw new ArgumentNullException("content");
            }

            // Prepare section list
            var             sections = new Dictionary <string, MyPurdueSection>();
            MyPurdueSection section  = null;

            // Check if we didn't return any classes
            if (content.Contains("No classes were found that meet your search criteria"))
            {
                return(sections);
            }

            HtmlDocument document = new HtmlDocument();

            document.LoadHtml(content);
            HtmlNode docRoot = document.DocumentNode;

            HtmlNodeCollection sectionNodes = docRoot.SelectNodes("/html/body/div[@class='pagebodydiv'][1]/table[@class='datadisplaytable'][1]/tr[ not ( th ) ]");

            if (sectionNodes == null)
            {
                throw new ApplicationException("Could not parse data from section details request.");
            }

            // The page content is machine-generated, so numbers are parsed culture-independently.
            var invariant = System.Globalization.CultureInfo.InvariantCulture;

            // Resolved lazily (only once a row actually needs it) and then reused for every row.
            TimeZoneInfo timeZone = null;

            // Loop through table rows
            foreach (var node in sectionNodes)
            {
                var crnNode = node.SelectSingleNode("td[2]");
                if (crnNode == null)
                {
                    continue;                                  // No node? Skip...
                }

                // Each row is a section AND/OR meeting.
                // If there's a CRN in this row, it means that we're looking at a new section.
                var crnNumber = HtmlEntity.DeEntitize(crnNode.InnerText).Trim();
                if (crnNumber.Length > 0)
                {
                    // Section w/ primary meeting data
                    section = new MyPurdueSection()
                    {
                        Crn              = crnNumber,
                        SubjectCode      = ReadCellText(node, 3),
                        Number           = ReadCellText(node, 4),
                        SectionCode      = ReadCellText(node, 5),
                        CampusCode       = ReadCellText(node, 6),
                        Title            = ReadCellText(node, 8),
                        Capacity         = Int32.Parse(ReadCellText(node, 11), invariant),
                        Enrolled         = Int32.Parse(ReadCellText(node, 12), invariant),
                        RemainingSpace   = Int32.Parse(ReadCellText(node, 13), invariant),
                        WaitlistCapacity = Int32.Parse(ReadCellText(node, 14), invariant),
                        WaitlistCount    = Int32.Parse(ReadCellText(node, 15), invariant),
                        WaitlistSpace    = Int32.Parse(ReadCellText(node, 16), invariant),
                        Type             = ReadCellText(node, 23),
                        Description      = ReadCellText(node, 26),
                        Meetings         = new List <MyPurdueMeeting>()
                    };

                    // Deal with credit hours: for "a-b" or "a/b" only the part after the
                    // separator is kept.
                    var credits = ReadCellText(node, 7);
                    var dash    = credits.IndexOf('-');
                    if (dash >= 0)
                    {
                        credits = credits.Substring(dash + 1);
                    }
                    else
                    {
                        var slash = credits.IndexOf('/');
                        if (slash >= 0)
                        {
                            credits = credits.Substring(slash + 1);
                        }
                    }
                    section.CreditHours = double.Parse(credits, invariant);

                    sections.Add(crnNumber, section);
                }
                else if (section == null)
                {
                    // A meeting-only row has nothing to attach to; fail with a clear
                    // message instead of a NullReferenceException further down.
                    throw new ApplicationException("Could not parse section data: found a meeting row before any section row.");
                }

                // Now, update meeting data for this row
                var meeting = new MyPurdueMeeting();

                // Parse days of week
                meeting.DaysOfWeek = ParseUtility.ParseDaysOfWeek(ReadCellText(node, 9));

                // Parse times
                if (timeZone == null)
                {
                    timeZone = TimeZoneInfo.FindSystemTimeZoneById("Eastern Standard Time");                 // TODO: Not hard-code time zone
                }
                var startEndTimes = ParseUtility.ParseStartEndTime(ReadCellText(node, 10), timeZone);
                meeting.StartTime = startEndTimes.Item1;
                meeting.EndTime   = startEndTimes.Item2;

                // Dates (td[21]) are intentionally not parsed: no year is present, so they are not reliable.

                // Update meeting location (building short name)
                var loc = ReadCellText(node, 22);
                if (loc.Equals("TBA"))
                {
                    meeting.BuildingCode = "TBA";
                    meeting.BuildingName = "TBA";
                    meeting.RoomNumber   = "TBA";
                }
                else if (loc.Length > 0)
                {
                    // "<building> <room>": everything up to the first space is the building code.
                    var space = loc.IndexOf(' ');
                    if (space >= 0)
                    {
                        meeting.BuildingCode = loc.Substring(0, space).Trim();
                        meeting.RoomNumber   = loc.Substring(space + 1).Trim();
                    }
                    else
                    {
                        meeting.BuildingCode = loc;
                        meeting.RoomNumber   = "";
                    }
                }
                else
                {
                    throw new ApplicationException("Could not parse location data for section CRN " + section.Crn + ".");
                }

                // Updating meeting type
                meeting.Type = ReadCellText(node, 23);

                // Add the meeting
                section.Meetings.Add(meeting);
            }

            return(sections);
        }

        /// <summary>
        /// Returns the de-entitized, trimmed text of the <paramref name="column" />-th
        /// (1-based) <c>td</c> child of <paramref name="row" />.
        /// </summary>
        /// <exception cref="ApplicationException">The row has no such cell.</exception>
        private static string ReadCellText(HtmlNode row, int column)
        {
            var cell = row.SelectSingleNode("td[" + column + "]");
            if (cell == null)
            {
                throw new ApplicationException("Could not parse data from section details request: missing column " + column + ".");
            }
            return(HtmlEntity.DeEntitize(cell.InnerText).Trim());
        }
Exemple #2
0
        /// <summary>
        /// Writes a text rendering of <paramref name="node" /> to <paramref name="outText" />.
        /// </summary>
        /// <remarks>
        /// Comments are dropped, and text inside <c>script</c>/<c>style</c> is dropped.
        /// <c>title</c> is wrapped in <c>&lt;u&gt;</c>, <c>p</c> emits a CRLF, and <c>a</c>
        /// with a non-empty href emits an <c>&lt;exec cmd="!wb ..."&gt;</c> link.
        /// Document nodes and element children are handed to <c>ConvertContentTo</c>.
        /// </remarks>
        /// <param name="startUrl">Prefix prepended to hrefs that do not start with "http".</param>
        /// <param name="node">The node to render.</param>
        /// <param name="outText">Destination writer.</param>
        private static void ConvertTo(string startUrl, HtmlNode node, TextWriter outText)
        {
            switch (node.NodeType)
            {
            case HtmlNodeType.Comment:
                // don't output comments
                break;

            case HtmlNodeType.Document:
                ConvertContentTo(startUrl, node, outText);
                break;

            case HtmlNodeType.Text:
                // script and style must not be output
                // (a detached text node has no parent; treat it as ordinary text)
                string parentName = node.ParentNode?.Name;
                if ((parentName == "script") || (parentName == "style"))
                {
                    break;
                }

                // get text
                string html = ((HtmlTextNode)node).Text;

                // is it in fact a special closing node output as text?
                if (HtmlNode.IsOverlappedClosingElement(html))
                {
                    break;
                }

                // check the text is meaningful and not a bunch of whitespaces
                if (html.Trim().Length > 0)
                {
                    outText.WriteLine(HtmlEntity.DeEntitize(html));
                }
                break;

            case HtmlNodeType.Element:
                switch (node.Name)
                {
                case "title":
                    outText.WriteLine($"<u>{node.InnerText}</u>");
                    break;

                case "p":
                    // treat paragraphs as crlf
                    outText.Write("\r\n");
                    break;

                case "a":
                    string linkTarget = node.GetAttributeValue("href", "");

                    // Test for a missing href BEFORE prefixing: once startUrl is prepended
                    // the target is never empty, which used to turn every href-less
                    // anchor into a link to startUrl.
                    if (linkTarget != "")
                    {
                        // Ordinal comparison: this is a URL prefix test, not linguistic
                        // text; the scheme is matched case-insensitively.
                        if (!linkTarget.StartsWith("http", System.StringComparison.OrdinalIgnoreCase))
                        {
                            linkTarget = startUrl + linkTarget;
                        }
                        outText.Write($"<exec cmd=\"!wb {linkTarget}\">{node.InnerText}</exec>");
                    }
                    break;
                }

                // NOTE(review): children are processed for every element, including
                // "title" and "a" whose InnerText was already written above — confirm
                // whether the repeated text is intended.
                if (node.HasChildNodes)
                {
                    ConvertContentTo(startUrl, node, outText);
                }
                break;
            }
        }
Exemple #3
0
        /// <summary>
        /// Loads the page at <paramref name="relativeUrl" /> (relative to
        /// <c>WikitionaryParser.WikitionaryRootUrl</c>) and extracts an <see cref="Idiom" />
        /// from it: name, usages (definitions, examples, quotes), synonyms and categories.
        /// </summary>
        /// <param name="relativeUrl">Page URL relative to the root URL.</param>
        /// <returns>The parsed idiom. Missing optional parts yield empty lists.</returns>
        /// <exception cref="System.InvalidOperationException">
        /// The page has no <c>h1#firstHeading</c> element to take the name from.
        /// </exception>
        public Idiom ParseIdiomPage(string relativeUrl)
        {
            var absoluteUrl = WikitionaryParser.WikitionaryRootUrl + relativeUrl;
            var document    = _web.Load(absoluteUrl).DocumentNode;

            // Drop every element that follows an <hr> sibling
            // (original intent: delete all nodes for other sections than English).
            var nodesToRemove = document.SelectNodes("//hr/following-sibling::*");

            if (nodesToRemove != null)
            {
                foreach (var nodeToRemove in nodesToRemove)
                {
                    nodeToRemove.Remove();
                }
            }

            // name
            var headingNode = document.SelectSingleNode("//h1[@id='firstHeading']");
            if (headingNode == null)
            {
                throw new System.InvalidOperationException("Could not find the page heading on " + absoluteUrl + ".");
            }
            var name = HtmlEntity.DeEntitize(headingNode.InnerText.Trim());

            // usages
            var usages = new List <Usage>();
            var relevantUsageSections = document.SelectNodes("//h3/span[@class='mw-headline']");

            if (relevantUsageSections != null)
            {
                foreach (var relevantUsageSection in relevantUsageSections.Where(s => !H3HeadLinesToIgnore.Contains(s.InnerText.Trim())))
                {
                    // SelectSingleNode/SelectNodes return null when nothing matches; a
                    // headline without a definition list carries no usage, so skip it.
                    var olNode   = relevantUsageSection.SelectSingleNode("./../following-sibling::ol");
                    var defNodes = olNode != null ? olNode.SelectNodes("./li") : null;
                    if (defNodes == null)
                    {
                        continue;
                    }

                    var definitionsAndExamples = new List <DefinitionAndExamples>();
                    foreach (var defNode in defNodes)
                    {
                        // The definition is the item's own text without its nested
                        // example (dl) and quote (ul) lists; strip those from a copy so
                        // the original node can still be queried below.
                        var clone            = defNode.CloneNode(true);
                        var childrenToRemove = clone.SelectNodes("./dl|./ul");
                        if (childrenToRemove != null)
                        {
                            foreach (var childToRemove in childrenToRemove)
                            {
                                clone.RemoveChild(childToRemove);
                            }
                        }

                        definitionsAndExamples.Add(new DefinitionAndExamples()
                        {
                            // De-entitized like every other extracted text.
                            Definition = HtmlEntity.DeEntitize(clone.InnerText.Trim()),
                            Examples   = ToTextList(defNode.SelectNodes("./dl/dd")),
                            Quotes     = ToTextList(defNode.SelectNodes("./ul/li//dd"))
                        });
                    }

                    usages.Add(new Usage()
                    {
                        DefinitionsAndExamples = definitionsAndExamples,
                        PartOfSpeech           = HtmlEntity.DeEntitize(relevantUsageSection.InnerText.Trim())
                    });
                }
            }

            // synonyms
            var synonyms = ToTextList(document.SelectNodes("//span[@id='Synonyms']/../following-sibling::ul//a"));

            // Categories (empty when the page has no category links)
            var categories = ToTextList(document.SelectNodes("//div[@id='mw-normal-catlinks']/ul/li/a"));

            return(new Idiom()
            {
                Name = name,
                Synonyms = synonyms,
                SourceRelativeUrl = relativeUrl,
                Categories = categories,
                Usages = usages
            });
        }

        /// <summary>
        /// Returns the trimmed, de-entitized inner text of each node, or an empty list
        /// when <paramref name="nodes" /> is null (the "no match" result of SelectNodes).
        /// </summary>
        private static List <string> ToTextList(HtmlNodeCollection nodes)
        {
            if (nodes == null)
            {
                return(new List <string>());
            }
            return(nodes.Select(n => HtmlEntity.DeEntitize(n.InnerText.Trim())).ToList());
        }
        /// <summary>
        /// Writes a lower-cased plain-text rendering of <paramref name="node" /> to
        /// <paramref name="outText" />.
        /// </summary>
        /// <remarks>
        /// Comment nodes produce nothing. Document nodes and element children are passed
        /// to <c>ConvertContentTo</c>. A <c>p</c> element first emits a CRLF. Text nodes
        /// are trimmed, lower-cased, de-entitized and written followed by one space;
        /// text that fails to de-entitize is written as "?".
        /// </remarks>
        /// <param name="node">The node to render.</param>
        /// <param name="outText">Destination writer.</param>
        private static void ConvertTo(HtmlNode node, TextWriter outText)
        {
            HtmlNodeType kind = node.NodeType;

            if (kind == HtmlNodeType.Document)
            {
                ConvertContentTo(node, outText);
                return;
            }

            if (kind == HtmlNodeType.Element)
            {
                // a paragraph starts on a fresh line
                if (node.Name == "p")
                {
                    outText.Write("\r\n");
                }

                if (node.HasChildNodes)
                {
                    ConvertContentTo(node, outText);
                }
                return;
            }

            // Only text nodes are left to handle; comments (and anything else) emit nothing.
            if (kind != HtmlNodeType.Text)
            {
                return;
            }

            // never emit the contents of script or style
            string ownerName = node.ParentNode.Name;
            if (ownerName == "script" || ownerName == "style")
            {
                return;
            }

            string rawText = ((HtmlTextNode)node).Text;

            // a stray closing tag that surfaced as text is not content
            if (HtmlNode.IsOverlappedClosingElement(rawText))
            {
                return;
            }

            // Trim() strips every leading/trailing whitespace character, CR and LF included.
            string meaningful = rawText.Trim();
            if (meaningful.Length == 0)
            {
                return;
            }

            string decoded;
            try
            {
                decoded = HtmlEntity.DeEntitize(meaningful.ToLowerInvariant());
            }
            catch (Exception)
            {
                // best effort: keep a placeholder instead of failing the conversion
                decoded = "?";
            }

            outText.Write(decoded);
            outText.Write(" ");
        }
        /// <summary>
        /// Scrapes the brewery list at friendsandfamily.beer and, for each brewery link,
        /// the beers listed on the linked page. Progress is written to the console.
        /// </summary>
        /// <remarks>
        /// Two paragraph layouts are recognised: <c>&lt;strong&gt;name&lt;/strong&gt; description</c>
        /// and <c>name, description</c>. One second is awaited between brewery pages.
        /// </remarks>
        /// <returns>All beers found, in page order.</returns>
        public async Task <IList <Beer> > Fetch()
        {
            var beers = new List <Beer>();

            Console.WriteLine("Scraping Cloudwater FFB...");
            Console.WriteLine();

            var breweriesPage = await ScrapeHelper.FetchParseAsync("https://www.friendsandfamily.beer/family");

            // Materialise once: the result is both counted and iterated.
            var breweryNodes = breweriesPage.QuerySelectorAll("h2 > a").ToList();

            Console.WriteLine($"Found {breweryNodes.Count} breweries...");
            Console.WriteLine();

            var textInfo = CultureInfo.CurrentCulture.TextInfo;

            foreach (var breweryNode in breweryNodes)
            {
                var href = breweryNode.Attributes["href"]?.Value;
                if (string.IsNullOrWhiteSpace(href))
                {
                    continue;
                }

                // De-entitize BEFORE changing case: title-casing the raw text would
                // rewrite entity names (e.g. "&amp;" -> "&Amp;"), which no longer decode.
                var breweryName = textInfo.ToTitleCase(
                    textInfo.ToLower(HtmlEntity.DeEntitize(breweryNode.InnerText ?? string.Empty).Trim())
                    );

                Console.WriteLine(breweryName);
                Console.WriteLine("------------------------------");

                var beersPage = await ScrapeHelper.FetchParseAsync(href);

                var beerNodes = beersPage.QuerySelectorAll("ul > li > p").ToList();

                if (beerNodes.Count == 0)
                {
                    Console.WriteLine("No beers found (yet)");
                }
                else
                {
                    foreach (var beerNode in beerNodes)
                    {
                        var children = beerNode.ChildNodes;
                        if (children.Count == 0)
                        {
                            continue;   // empty paragraph: nothing to read
                        }

                        var    first       = children[0];
                        string beerName    = null;
                        string description = null;

                        // Style: <strong>beer name</strong> description <strong>(v)</strong>
                        if (string.Equals(first.Name, "strong", StringComparison.OrdinalIgnoreCase))
                        {
                            beerName = first.InnerText?.Trim();
                            // The description is optional; indexing past the end would throw.
                            description = children.Count > 1 ? children[1]?.InnerText?.Trim() : null;
                        }
                        // Style: beer name, description <strong>(v)</strong>
                        else
                        {
                            var parts = (first.InnerText ?? string.Empty).Trim().Split(',');
                            beerName    = parts[0].Trim();
                            description = string.Join(',', parts.Skip(1));
                        }

                        description = description?.TrimStart(' ', ',');

                        Console.WriteLine($"{beerName} ----- {description}");

                        beers.Add(new Beer
                        {
                            BreweryName = breweryName,   // already de-entitized above
                            BeerName    = beerName != null ? HtmlEntity.DeEntitize(beerName)    : null,
                            Description = description != null ? HtmlEntity.DeEntitize(description) : null
                        });
                    }
                }

                Console.WriteLine();
                await Task.Delay(1000);
            }

            Console.WriteLine($"Found {beers.Count} beers.");

            return(beers);
        }
Exemple #6
0
        /// <summary>
        /// Walks the paginated adoption list on napaluchu.waw.pl, opens every animal page
        /// linked from it and collects one <see cref="ListTemp" /> per animal, including
        /// up to five downloaded photos.
        /// </summary>
        /// <param name="shelter_id">Its <c>ID</c> is stored on every returned item.</param>
        /// <returns>The scraped animals, in list order.</returns>
        public List <ListTemp> scrap(Shelter_ID.id shelter_id)
        {
            const string siteRoot    = @"http://www.napaluchu.waw.pl";
            const string listUrlBase = siteRoot + "/czekam_na_ciebie/wszystkie_zwierzeta_do_adopcji:";
            const int    maxPhotos   = 5;

            HtmlWeb         web_page = new HtmlWeb();
            List <ListTemp> listTemp = new List <ListTemp>();

            int number = 1;
            var doc    = web_page.Load(listUrlBase + number);

            while (true)
            {
                // A "next" link in the pagination bar means another list page follows.
                bool hasNextPage = doc.DocumentNode.SelectNodes("//div[@class = 'pagination']/a[@class = 'next']") != null;

                // SelectNodes returns null (not an empty collection) when nothing matches.
                var linkNodes   = doc.DocumentNode.SelectNodes("//a[@class = 'animals_btn_list_more']/@href");
                var animal_link = linkNodes == null
                    ? new List <string>()
                    : linkNodes.Select(q => q.GetAttributeValue("href", null))
                      .Where(h => !string.IsNullOrEmpty(h))
                      .ToList();

                foreach (var link in animal_link)
                {
                    // Declared per animal so a field missing on one page cannot inherit
                    // the value scraped from the previous animal.
                    string        _name      = "";
                    string        _breed     = "";
                    string        _gender    = "";
                    string        _age       = "";
                    string        _weight    = "";
                    DateTime      _dateStart = DateTime.Today;
                    List <byte[]> _photo     = new List <byte[]>();

                    var animal_doc = web_page.Load(siteRoot + link);

                    //--INFO
                    var infoNode = animal_doc.DocumentNode.SelectSingleNode("//div[@class = 'info']");
                    if (infoNode == null)
                    {
                        continue;   // not an animal page we can read
                    }

                    var nodeInfo = infoNode.InnerText.Replace("\r", "").Replace("\n", "").Trim();
                    nodeInfo = HtmlEntity.DeEntitize(nodeInfo).Trim();

                    // NOTE(review): presumably the text reads "Label: value Label: value ...",
                    // so after splitting on ':' each segment is the PREVIOUS label's value
                    // followed by the next label. That would explain why the segment
                    // containing e.g. "Waga" feeds _age — confirm against the live page.
                    foreach (var segment in nodeInfo.Split(':'))
                    {
                        if (segment.Contains("Gatunek"))
                        {
                            _name = segment.Replace("Gatunek", "").Trim();
                        }
                        if (segment.Contains("Płeć"))
                        {
                            _breed = segment.Replace("Płeć", "").Trim();
                        }
                        if (segment.Contains("Wiek"))
                        {
                            _gender = segment.Replace("Wiek", "").Trim();
                        }
                        if (segment.Contains("Waga"))
                        {
                            _age = segment.Replace("Waga", "").Replace("lat", "").Replace("rok", "").Trim();
                        }
                        if (segment.Contains("Data przyjęcia"))
                        {
                            _weight = segment.Replace("Data przyjęcia", "").Trim();
                        }
                        if (segment.Contains("ewidencyjny"))
                        {
                            // day.month.year
                            var dateParts = segment.Replace("Nr ewidencyjny", "").Trim().Split('.');
                            var year      = int.Parse(dateParts[2]);
                            var month     = int.Parse(dateParts[1]);
                            var day       = int.Parse(dateParts[0]);
                            _dateStart = new DateTime(year, month, day);
                        }
                    }//--INFO

                    //--Description
                    var descriptionNodes = animal_doc.DocumentNode.SelectNodes("//div[@class = 'description']");
                    var _description     = descriptionNodes == null
                        ? ""
                        : string.Concat(descriptionNodes.Select(q => q.InnerText));
                    _description = HtmlEntity.DeEntitize(_description).Replace("\r", " ").Replace("\n", "").Trim();
                    //--Description

                    //--Photo
                    var node_Photo = animal_doc.DocumentNode.SelectNodes("//div[@class = 'ani_images']/div[@class = 'ani_image_bottom']/a");
                    if (node_Photo != null)
                    {
                        var photoLinks = node_Photo
                                         .Select(q => q.GetAttributeValue("href", null))
                                         .Where(h => !string.IsNullOrEmpty(h))
                                         .Take(maxPhotos);

                        // One client serves all (sequential) downloads of this animal.
                        using (var client = new WebClient())
                        {
                            foreach (var photoLink in photoLinks)
                            {
                                _photo.Add(client.DownloadData(siteRoot + photoLink));
                            }
                        }
                    }
                    //--Photo

                    listTemp.Add(new ListTemp()
                    {
                        name        = _name,
                        breed       = _breed,
                        gender      = _gender,
                        age         = _age,
                        weight      = _weight,
                        description = _description,
                        dateStart   = _dateStart,
                        shelter_ID  = shelter_id.ID,
                        photo       = _photo,
                    });
                }

                // Stop before requesting a page that is known not to exist.
                if (!hasNextPage)
                {
                    break;
                }

                number++;
                doc = web_page.Load(listUrlBase + number);
            }

            return(listTemp);
        }
Exemple #7
0
        /// <summary>
        /// Finds the first link on <paramref name="currentPage" /> whose de-entitized,
        /// trimmed text equals <paramref name="messageText" /> (culture-invariant,
        /// case-insensitive) and returns its href prefixed with "http://y.20q.net".
        /// </summary>
        /// <param name="currentPage">The page to search.</param>
        /// <param name="messageText">The link text to look for.</param>
        /// <returns>The absolute URI of the matching link.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="currentPage" /> is null.</exception>
        /// <exception cref="InvalidOperationException">No link with that text exists on the page.</exception>
        public static string GetNextUri(HtmlDocument currentPage, string messageText)
        {
            if (currentPage == null)
            {
                throw new ArgumentNullException(nameof(currentPage));
            }

            // SelectNodes returns null (not an empty collection) when the page has no anchors.
            var anchors = currentPage.DocumentNode.SelectNodes("//a");

            HtmlNode a = anchors == null
                ? null
                : anchors.FirstOrDefault(x =>
                                         string.Equals(messageText, HtmlEntity.DeEntitize(x.InnerText).Trim(),
                                                       StringComparison.InvariantCultureIgnoreCase));

            if (a == null)
            {
                throw new InvalidOperationException("No link with text '" + messageText + "' was found on the page.");
            }

            // NOTE(review): an anchor without href yields just the site root — confirm this is acceptable.
            string uri = a.GetAttributeValue("href", def: null);

            return("http://y.20q.net" + uri);
        }
Exemple #8
0
        /// <summary>
        ///   Converts the given <paramref name="htmlContent" /> into a sequence of <see cref="IHtmlElement" />s.
        /// </summary>
        /// <remarks>
        ///   An <see cref="HtmlTag" /> is returned as-is, a <see cref="StringHtmlContent" /> is wrapped in an
        ///   <see cref="HtmlText" />, and a <see cref="TagBuilder" /> is rebuilt as an <see cref="HtmlTag" />
        ///   (attribute values de-entitized, inner HTML parsed recursively). Any other content is rendered
        ///   with <c>ToHtmlString()</c> and passed to the string-based overload.
        /// </remarks>
        /// <param name="htmlContent">The html content</param>
        /// <param name="validateSyntax">A value indicating whether the html should be checked for syntax errors.</param>
        /// <returns>A collection of <see cref="IHtmlElement" /></returns>
        /// <exception cref="ArgumentNullException">If <paramref name="htmlContent" /> is null</exception>
        /// <exception cref="InvalidOperationException">
        ///   If <paramref name="validateSyntax" /> is true and syntax errors are
        ///   encountered in the <paramref name="htmlContent" />
        /// </exception>
        public static IEnumerable <IHtmlElement> ParseAll(IHtmlContent htmlContent, bool validateSyntax = false)
        {
            switch (htmlContent)
            {
            case null:
                throw new ArgumentNullException(nameof(htmlContent));

            // already an HtmlTag: nothing to parse
            case HtmlTag existingTag:
                return(new[] { existingTag });

            // plain string that may contain HTML but must be encoded when written
            case StringHtmlContent encodedText:
                return(new[] { new HtmlText(encodedText) });

            // TagBuilder: rebuild it piece by piece
            case TagBuilder builder:
            {
                var rebuilt = new HtmlTag(builder.TagName)
                              .WithTagRenderMode(builder.TagRenderMode);

                foreach (var attribute in builder.Attributes)
                {
                    rebuilt = rebuilt.Attribute(attribute.Key, HtmlEntity.DeEntitize(attribute.Value));
                }

                if (builder.HasInnerHtml)
                {
                    var children = ParseAll(builder.InnerHtml, validateSyntax).ToImmutableList();
                    rebuilt = rebuilt.WithContents(children);
                }

                return(new[] { rebuilt });
            }

            // anything else: render to a string and parse that
            default:
                return(ParseAll(htmlContent.ToHtmlString(), validateSyntax));
            }
        }
        /**************************************************************************/

        public static string MakeUrlAbsolute(
            string BaseUrl,
            string Url
            )
        {
            string UrlFixed;
            Uri    BaseUri     = null;
            string BaseUriPort = "";
            Uri    NewUri      = null;

            Regex reHTTP              = new Regex("^https?:");
            Regex reDoubleSlash       = new Regex("^//");
            Regex reSlash             = new Regex("^/");
            Regex reQuery             = new Regex("^\\?");
            Regex reHash              = new Regex("^#");
            Regex reUnsupportedScheme = new Regex("^[^:]+:");

            BaseUrl = HtmlEntity.DeEntitize(BaseUrl);
            BaseUrl = Uri.UnescapeDataString(BaseUrl);

            Url = HtmlEntity.DeEntitize(Url);
            Url = Uri.UnescapeDataString(Url);

            try
            {
                BaseUri = new Uri(BaseUrl, UriKind.Absolute);

                if (BaseUri.Port > 0)
                {
                    BaseUriPort = string.Format(":{0}", BaseUri.Port);
                }
            }
            catch (UriFormatException ex)
            {
                DebugMsgStatic(string.Format("MakeUrlAbsolute: {0}", ex.Message));
            }
            catch (Exception ex)
            {
                DebugMsgStatic(string.Format("MakeUrlAbsolute: {0}", ex.Message));
            }

            if (BaseUri == null)
            {
                throw new MacroscopeUriFormatException("Malformed Base URI");
            }

            if (!Regex.IsMatch(Url, "^(https?:|/|#)"))
            {
                DebugMsgStatic(string.Format("STRANGE URL: 1: {0}", BaseUrl));
                DebugMsgStatic(string.Format("STRANGE URL: 2: {0}", Url));
            }

            if (!reHTTP.IsMatch(Url))
            {
                bool IsSuspect = false;
                if (
                    (!reDoubleSlash.IsMatch(Url)) &&
                    (!reSlash.IsMatch(Url)) &&
                    (!reQuery.IsMatch(Url)) &&
                    (!reHash.IsMatch(Url)))
                {
                    if (reUnsupportedScheme.IsMatch(Url))
                    {
                        IsSuspect = true;
                    }
                }
                if (IsSuspect)
                {
                    DebugMsgStatic(string.Format("STRANGE URL: IS SUSPECT: {0}", Url));
                    return(null);
                }
            }

            if (reDoubleSlash.IsMatch(Url))
            {
                try
                {
                    NewUri = new Uri(
                        string.Format(
                            "{0}:{1}",
                            BaseUri.Scheme,
                            Url
                            ),
                        UriKind.Absolute
                        );
                }
                catch (InvalidOperationException ex)
                {
                    DebugMsgStatic(ex.Message);
                }
                catch (UriFormatException ex)
                {
                    DebugMsgStatic(ex.Message);
                }
            }
            else
            if (reSlash.IsMatch(Url))
            {
                try
                {
                    NewUri = new Uri(
                        string.Format(
                            "{0}://{1}{2}{3}",
                            BaseUri.Scheme,
                            BaseUri.Host,
                            BaseUriPort,
                            Url
                            ),
                        UriKind.Absolute
                        );
                }
                catch (InvalidOperationException ex)
                {
                    DebugMsgStatic(ex.Message);
                }
                catch (UriFormatException ex)
                {
                    DebugMsgStatic(ex.Message);
                }
            }
            else
            if (reQuery.IsMatch(Url))
            {
                try
                {
                    NewUri = new Uri(
                        string.Format(
                            "{0}://{1}{2}{3}{4}",
                            BaseUri.Scheme,
                            BaseUri.Host,
                            BaseUriPort,
                            BaseUri.AbsolutePath,
                            Url
                            ),
                        UriKind.Absolute
                        );
                }
                catch (InvalidOperationException ex)
                {
                    DebugMsgStatic(ex.Message);
                }
                catch (UriFormatException ex)
                {
                    DebugMsgStatic(ex.Message);
                }
            }
            else
            if (reHash.IsMatch(Url))
            {
                string NewUrl       = Url;
                Regex  reHashRemove = new Regex("#.*$", RegexOptions.Singleline);
                NewUrl = reHashRemove.Replace(NewUrl, "");

                try
                {
                    NewUri = new Uri(
                        string.Format(
                            "{0}://{1}{2}{3}",
                            BaseUri.Scheme,
                            BaseUri.Host,
                            BaseUriPort,
                            NewUrl
                            ),
                        UriKind.Absolute
                        );
                }
                catch (InvalidOperationException ex)
                {
                    DebugMsgStatic(ex.Message);
                }
                catch (UriFormatException ex)
                {
                    DebugMsgStatic(ex.Message);
                }
            }
            else
            if (reHTTP.IsMatch(Url))
            {
                try
                {
                    NewUri = new Uri(Url, UriKind.Absolute);
                }
                catch (InvalidOperationException ex)
                {
                    DebugMsgStatic(ex.Message);
                }
                catch (UriFormatException ex)
                {
                    DebugMsgStatic(ex.Message);
                }
            }
            else
            if (reUnsupportedScheme.IsMatch(Url))
            {
                ; // NO-OP, for now.
            }
            else
            {
                DebugMsgStatic(string.Format("RELATIVE URL 1: {0}", Url));

                string BasePath = Regex.Replace(BaseUri.AbsolutePath, "/[^/]+$", "/");
                string NewPath  = string.Join("", BasePath, Url);

                DebugMsgStatic(string.Format("RELATIVE URL 2: {0}", BasePath));
                DebugMsgStatic(string.Format("RELATIVE URL 3: {0}", NewPath));

                try
                {
                    NewUri = new Uri(
                        string.Format(
                            "{0}://{1}{2}{3}",
                            BaseUri.Scheme,
                            BaseUri.Host,
                            BaseUriPort,
                            NewPath
                            ),
                        UriKind.Absolute
                        );
                }
                catch (InvalidOperationException ex)
                {
                    DebugMsgStatic(ex.Message);
                }
                catch (UriFormatException ex)
                {
                    DebugMsgStatic(ex.Message);
                }
            }

            if (NewUri != null)
            {
                UrlFixed = NewUri.ToString();
            }
            else
            {
                UrlFixed = Url;
            }

            UrlFixed = SanitizeUrl(UrlFixed);

            return(UrlFixed);
        }
Exemple #10
0
        /// <summary>
        /// Searches the remote timetable site for trips matching <paramref name="request"/>.
        /// </summary>
        /// <remarks>
        /// The start page at <c>options.Host</c> is fetched first, because the query id needed for the
        /// search URI is embedded in the <c>action</c> attribute of the <c>HFSQuery</c> form. The search
        /// form is then posted and every overview row of the result table is converted into a trip.
        /// </remarks>
        /// <param name="request">Search criteria; passed to <c>options.BuildSearch</c>.</param>
        /// <returns>One trip per overview row; empty when the result table contains no overview rows.</returns>
        /// <exception cref="InvalidOperationException">
        /// The query id cannot be read from the start page, or the response contains no result table.
        /// </exception>
        /// <exception cref="HttpRequestException">One of the two requests did not succeed.</exception>
        public async Task <IEnumerable <PublicTransportTrip> > SearchAsync(TripSearchRequest request)
        {
            var resultHtml = new HtmlDocument();

            // Both requests share one client; the client and the responses are released as soon as
            // their bodies have been read.
            using (var client = new HttpClient())
            {
                var startHtml = new HtmlDocument();

                using (var startResponse = await client.GetAsync(options.Host))
                {
                    startResponse.EnsureSuccessStatusCode();
                    startHtml.LoadHtml(await startResponse.Content.ReadAsStringAsync());
                }

                var formAction = startHtml.DocumentNode
                                 .SelectSingleNode("//form[@id='HFSQuery']")
                                 ?.Attributes["action"]
                                 ?.Value;

                if (formAction == null)
                {
                    throw new InvalidOperationException("Could not find the query form on the start page.");
                }

                var idText = queryIdRegex.Match(formAction).Groups["id"].Value;

                if (!int.TryParse(idText, NumberStyles.Integer, CultureInfo.InvariantCulture, out int queryId))
                {
                    throw new InvalidOperationException("Could not read the query id from the query form.");
                }

                var requestContent = options.BuildSearch(request);

                using (var response = await client.PostAsync(
                           options.SearchURI(queryId),
                           new FormUrlEncodedContent(requestContent)))
                {
                    response.EnsureSuccessStatusCode();
                    resultHtml.LoadHtml(await response.Content.ReadAsStringAsync());
                }
            }

            var resultTable = resultHtml.DocumentNode.SelectSingleNode("//table[@class='resultTable']");

            if (resultTable == null)
            {
                throw new InvalidOperationException("No trips could be found!");
            }

            var result = new List <PublicTransportTrip>();

            // The leading '.' keeps the query relative to the result table; a bare "//tr" would be
            // evaluated from the document root and could pick up rows of unrelated tables.
            var rows = resultTable.SelectNodes(".//tr[starts-with(@id, 'trOverview') and not(starts-with(@id, 'trOverviewHint') )]");

            if (rows == null)
            {
                // SelectNodes yields null (not an empty collection) when nothing matches.
                return(result);
            }

            foreach (var row in rows)
            {
                var columns = row.SelectNodes("td");

                if (columns == null)
                {
                    continue;   // Row without cells carries no trip data.
                }

                var builder = PublicTransportTrip.NewBuilder(Operator);

                DateTime startDate = DateTime.MinValue;
                TimeSpan startTime = TimeSpan.MinValue;
                DateTime endDate   = DateTime.MinValue;
                TimeSpan endTime   = TimeSpan.MinValue;

                foreach (var column in columns)
                {
                    switch (column.Attributes["headers"]?.Value)
                    {
                    case "hafasOVStop":
                        var startStop = GetTextContent(column.FirstChild);
                        var endStop   = GetTextContent(column.LastChild);
                        builder.SetStartLocation(new Destination(startStop, 0, 0));
                        builder.SetEndLocation(new Destination(endStop, 0, 0));
                        break;

                    case "hafasOVDate":
                        startDate = DateTime.ParseExact(column.InnerText, "dd.MM.yyyy", CultureInfo.InvariantCulture);
                        endDate   = startDate;   // Adjusted below when the arrival is past midnight.
                        break;

                    case "hafasOVTime":
                        // First line is the departure ("ab"), second line the arrival ("an").
                        var      planned = column.SelectSingleNode("div/div[@class='planed']").InnerText;
                        string[] lines   = planned.Trim().Split('\n');

                        startTime = TimeSpan.Parse(lines[0].Replace("ab", ""), CultureInfo.InvariantCulture);
                        endTime   = TimeSpan.Parse(lines[1].Replace("an", ""), CultureInfo.InvariantCulture);
                        break;

                    case "hafasOVProducts":
                        builder.AddType(PublicTransportType.Train);     // TODO
                        break;

                    default:
                        // Duration, changes and any other columns are not used.
                        break;
                    }
                }

                // An arrival clock time earlier than the departure means the trip ends on the next day.
                if (endTime < startTime)
                {
                    endDate = endDate.AddDays(1);
                }

                builder.SetStartTime(startDate + startTime);
                builder.SetEndTime(endDate + endTime);

                result.Add(builder.Build());
            }

            return(result);

            // Entity-decoded, trimmed text of a node.
            string GetTextContent(HtmlNode node)
            {
                string text = node.InnerText;

                text = HtmlEntity.DeEntitize(text);
                return(text.Trim());
            }
        }
Exemple #11
0
 /// <summary>
 /// Returns <paramref name="value"/> with its HTML entities decoded by <c>HtmlEntity.DeEntitize</c>.
 /// </summary>
 private string DecodeValue(string value) => HtmlEntity.DeEntitize(value);
        /// <summary>
        /// De-entitizes <paramref name="org"/> and replaces every occurrence of each non-empty
        /// <c>table.orig</c> from <c>exchangeTable</c> with the text obtained by decoding
        /// <c>table.code</c> as code page 932 bytes.
        /// </summary>
        /// <param name="org">Input text; HTML entities are decoded before any replacement.</param>
        /// <remarks>
        /// NOTE(review): the end of this method (return statement and closing brace) is not visible in
        /// this excerpt; presumably it returns <c>ret</c> - confirm against the full file.
        /// </remarks>
        public string exchangeStrings(string org)
        {
            string ret;
            byte[] toAry = new byte[2];   // Scratch buffer holding the one or two bytes of table.code.
            string to;

            StringBuilder sb = new StringBuilder(HtmlEntity.DeEntitize(org));

            foreach (structExchangeTable table in exchangeTable)
            {
                // An empty search string has nothing to replace; skip the entry.
                if (table.orig.Length == 0)
                {
                    continue;
                }
                // Split the 16-bit code into bytes, high byte first.
                toAry[0] = (byte)((table.code >> 8) & 0xff);
                if (toAry[0] != 0)
                {
                    toAry[1] = (byte)(table.code & 0xff);
                }
                else
                {
                    // High byte is zero: the low byte goes first and the second byte is zero.
                    // NOTE(review): both bytes are still decoded below, so the zero byte is part of
                    // the decoded text - confirm that a trailing NUL character is intended.
                    toAry[0] = (byte)(table.code & 0xff);
                    toAry[1] = 0;
                }
                to = System.Text.Encoding.GetEncoding(932).GetString(toAry);
                sb.Replace(table.orig, to);
            }
            ret = sb.ToString();

            // Reference list of broadcast marker symbols: marker, meaning, and two hex codes.
            // NOTE(review): the meaning of the two code columns is not stated here - TODO confirm.
            // NOTE(review): "F23D3" and "D2DC" below break the pattern of the neighbouring values
            //               and may be typos; they are reproduced unchanged.
            // [HV] HDTV       E0F8  F2CE
            // [SD] SDTV       E0F9  F2CF
            // [手] sign-language interpreted broadcast E0FD  F23D3
            // [字] subtitled broadcast     E0FE F2D4
            // [双] interactive (two-way) broadcast    E0FF F2D5
            // [デ] program-linked data broadcast E180 F2D6
            // [S] stereo broadcast     E181   F2D7
            // [二] bilingual broadcast    E182   F2D8
            // [多] multiplexed-audio broadcast    E183   F2D9
            // [解] audio commentary        E184   F2DA
            // [SS] surround stereo E185 F2DB
            // [B] compressed B-mode stereo E186 D2DC
            // [N] news    E187 F2DD
            // [天] weather forecast   E18A F2E0
            // [交] traffic information   E18B F2E1
            // [映] theatrical film   E18C F2E2
            // [料] pay broadcast   E18D F2E4
            // [前] first part       E190 F2E6
            // [後] second part       E191 F2E7
            // [再] rerun     E192 F2E8
            // [新] new program     E193 F2E9
            // [初] first broadcast   E194 F2EA
            // [終] final episode     E195 F2EB
            // [生] live broadcast     E196 F2EC
            // [PV] pay-per-view E19A F2F0




            //            {"[HV]",{0xE0,0xF8}},
            //{"[SD]",{0xE0,0xF9}
            //"[手]",0xE0,0xFD
            //"[字]",0xE0,0xFE
            //"[双]",0xE0,0xFF
            //"[デ]",0xE1,0x80
            //"[S]",0xE1,0x81
            //"[二]",0xE1,0x82
            //"[多]",0xE1,0x83
            //"[解]",0xE1,0x84
            //"[SS]",0xE1,0x85
            //"[B]",0xE1,0x86
            //"[N]",0xE1,0x87
            //"[天]",0xE1,0x8A
            //"[交]",0xE1,0x8B
            //"[映]",0xE1,0x8C
            //"[料]",0xE1,0x8D
            //"[前]",0xE1,0x90
            //"[後]",0xE1,0x91
            //"[再]",0xE1,0x92
            //"[新]",0xE1,0x93
            //"[初]",0xE1,0x94
            //"[終]",0xE1,0x95
            //"[生]",0xE1,0x96
            //"[PV]",0xE1,0x9A

// 現在のHTML界隈で使われることがある HTML Entities 群。
    &#009;	タブ
    &#010;	改行
    &#013;	復帰
    &#032;	スペース
!	&#033;	感嘆符
"	&quot;	&#034;	ダブルクォーテーション
//#   &#035;	シャープ
$	&#036;	ドル
%	&#037;	パーセント
&	&amp;	&#038;	アンパサンド
'	&#039;	アポストロフィ
(	&#040;	開く括弧
)	&#041;	閉じる括弧
*	&#042;	アスタリスク
+	&#043;	プラス
,	&#044;	コンマ
-	&#045;	ハイフン
.	&#046;	ピリオド
/	&#047;	スラッシュ
0~9	&#048;~&#057;	数字
 	&nbsp;	&#160;	改行されないスペース
:	&#058;	コロン
;	&#059;	セミコロン
<	&lt;	&#060;	小なり
=	&#061;	イコール
>	&gt;	&#062;	大なり
?	&#063;	疑問符
@	&#064;	アットマーク
A~Z	&#065;~&#090;	大文字アルファベット
Exemple #13
0
        /// <summary>
        /// Writes the plain-text rendering of <paramref name="node"/> to <paramref name="outText"/>.
        /// </summary>
        /// <remarks>
        /// A document node hands its content to <c>ConvertContentTo</c>. A text node is written
        /// entity-decoded unless its parent is <c>script</c>, <c>style</c> or <c>strong</c>, it is an
        /// overlapped closing element, or it is whitespace only. An element writes a line break for
        /// <c>p</c> and (while <c>tries</c> is below <c>maxTries</c>) for <c>br</c>, then hands its
        /// children to <c>ConvertContentTo</c>. Every other node type, comments included, is ignored.
        /// </remarks>
        /// <param name="node">Node to convert.</param>
        /// <param name="outText">Writer receiving the text.</param>
        private static void ConvertTo(HtmlNode node, TextWriter outText)
        {
            HtmlNodeType kind = node.NodeType;

            if (kind == HtmlNodeType.Document)
            {
                ConvertContentTo(node, outText);
                return;
            }

            if (kind == HtmlNodeType.Text)
            {
                // Script and style bodies are never output; titles (strong) are unwanted as well.
                string owner = node.ParentNode.Name;
                bool   muted = (owner == "script") || (owner == "style") || (owner == "strong");
                if (muted)
                {
                    return;
                }

                string raw = ((HtmlTextNode)node).Text;

                // A special closing node that was emitted as text carries no content.
                if (HtmlNode.IsOverlappedClosingElement(raw))
                {
                    return;
                }

                // Only meaningful text is written, never a run of whitespace.
                if (raw.Trim().Length > 0)
                {
                    outText.Write(HtmlEntity.DeEntitize(raw));
                }
                return;
            }

            if (kind != HtmlNodeType.Element)
            {
                // Comments (and any remaining node types) produce no output.
                return;
            }

            string tag = node.Name;
            if (tag == "p")
            {
                // Paragraphs become a CR/LF pair.
                outText.Write("\r\n");
            }
            else if (tag == "br")
            {
                // Line breaks are honoured only until the maximum number of tries is reached.
                if (tries < maxTries)
                {
                    tries++;
                    outText.Write(Environment.NewLine);
                }
            }

            if (node.HasChildNodes)
            {
                ConvertContentTo(node, outText);
            }
        }
Exemple #14
0
 /// <summary>
 /// Decodes the HTML entities in <paramref name="text"/> with <c>HtmlEntity.DeEntitize</c>.
 /// </summary>
 /// <param name="text">Text to decode.</param>
 /// <returns><c>null</c> when <paramref name="text"/> is null or empty; otherwise the decoded text.</returns>
 public static string DeEntitize(this string text)
 {
     if (string.IsNullOrEmpty(text))
     {
         return null;
     }

     return HtmlEntity.DeEntitize(text);
 }
Exemple #15
0
        /// <summary>
        /// Builds a <c>MangaObject</c> from the HTML of a series page.
        /// </summary>
        /// <remarks>
        /// Reads the cover, name, type, description, alternate names, authors, artists, genres, the
        /// chapter list for <c>ExtensionDescriptionAttribute.Language</c> and the rating. The information
        /// box, property table and title nodes are dereferenced unchecked, so a page without them
        /// fails with a <see cref="NullReferenceException"/>. The chapter list and rating are optional.
        /// </remarks>
        /// <param name="content">HTML of the series page.</param>
        /// <returns>
        /// The parsed object. <c>Rating</c> is -1 when no rating could be read, and a chapter whose
        /// release date cannot be parsed keeps the current time as its release date.
        /// </returns>
        public MangaObject ParseMangaObject(string content)
        {
            HtmlDocument MangaObjectDocument = new HtmlDocument();

            MangaObjectDocument.LoadHtml(content);

            HtmlNode InformationNode = MangaObjectDocument.DocumentNode.SelectSingleNode("//div[contains(@class,'ipsBox')]/div");
            String   Cover           = InformationNode.SelectSingleNode(".//div[1]/img").Attributes["src"].Value;

            HtmlNode MangaProperties = InformationNode.SelectSingleNode(".//table[contains(@class,'ipb_table')]"),
                     ChapterListing  = MangaObjectDocument.DocumentNode.SelectSingleNode("//table[contains(@class,'chapters_list')]");

            String MangaName                  = HtmlEntity.DeEntitize(MangaObjectDocument.DocumentNode.SelectSingleNode("//h1[contains(@class,'ipsType_pagetitle')]").InnerText.Trim()),
                   MangaTypeProp              = HtmlEntity.DeEntitize(MangaProperties.SelectSingleNode(".//tr[5]/td[2]").InnerText),
                   Desciption                 = HtmlEntity.DeEntitize(MangaProperties.SelectSingleNode(".//tr[7]/td[2]").InnerText.Replace("<br>", "\n"));
            MangaObjectType MangaType         = MangaObjectType.Unknown;
            FlowDirection   PageFlowDirection = FlowDirection.RightToLeft;

            // Surrounding whitespace in the table cell must not prevent a match.
            switch (MangaTypeProp.Trim().ToLower())
            {
            case "manga (japanese)":
                MangaType         = MangaObjectType.Manga;
                PageFlowDirection = FlowDirection.RightToLeft;
                break;

            case "manhwa (korean)":
                MangaType         = MangaObjectType.Manhwa;
                PageFlowDirection = FlowDirection.LeftToRight;
                break;

            case "manhua (chinese)":
                MangaType         = MangaObjectType.Manhua;
                PageFlowDirection = FlowDirection.LeftToRight;
                break;

            default:
                MangaType         = MangaObjectType.Unknown;
                PageFlowDirection = FlowDirection.RightToLeft;
                break;
            }

            HtmlNodeCollection AlternateNameNodes = MangaProperties.SelectSingleNode(".//tr[1]/td[2]").SelectNodes(".//span"),
                               GenreNodes         = MangaProperties.SelectSingleNode(".//tr[4]/td[2]").SelectNodes(".//a/span");

            String[] AlternateNames = { },
            Authors = { HtmlEntity.DeEntitize(MangaProperties.SelectSingleNode(".//tr[2]/td[2]/a").InnerText) },
            Artists = { HtmlEntity.DeEntitize(MangaProperties.SelectSingleNode(".//tr[3]/td[2]/a").InnerText) },
            Genres  = { };
            if (AlternateNameNodes != null && AlternateNameNodes.Count > 0)
            {
                AlternateNames = (from HtmlNode AltNameNode in AlternateNameNodes select HtmlEntity.DeEntitize(AltNameNode.InnerText.Trim())).ToArray();
            }
            if (GenreNodes != null && GenreNodes.Count > 0)
            {
                Genres = (from HtmlNode GenreNode in GenreNodes select HtmlEntity.DeEntitize(GenreNode.InnerText.Trim())).ToArray();
            }

            List <ChapterObject> Chapters     = new List <ChapterObject>();
            // A page without a chapter table simply has no chapters.
            HtmlNodeCollection   ChapterNodes = ChapterListing == null
                                                ? null
                                                : ChapterListing.SelectNodes(String.Format(".//tr[contains(@class,'lang_{0} chapter_row')]", ExtensionDescriptionAttribute.Language));

            if (ChapterNodes != null && ChapterNodes.Count > 0)
            {
                foreach (HtmlNode ChapterNode in ChapterNodes)
                {
                    HtmlNode VolChapNameNode = ChapterNode.SelectSingleNode("td[1]/a");
                    if (VolChapNameNode == null)
                    {
                        continue;   // Row without a chapter link cannot be turned into a chapter.
                    }

                    HtmlNode ReleaseNode  = ChapterNode.SelectSingleNode("td[5]");
                    String   VolChapText  = VolChapNameNode.InnerText;
                    Match    VolChapMatch = Regex.Match(VolChapText, @"(Vol\.(?<Volume>\d+)\s)?(Ch\.(?<Chapter>\d+))(\.(?<SubChapter>\d+))?");

                    // The name follows the volume/chapter prefix and a two character separator;
                    // a link that holds only the prefix has no name.
                    Int32  NameStart   = VolChapMatch.Length + 2;
                    String ChapterName = NameStart < VolChapText.Length ? VolChapText.Substring(NameStart).Trim() : String.Empty,
                           ReleaseData = ReleaseNode == null ? String.Empty : ReleaseNode.InnerText.Trim();

                    UInt32 Volume = 0, Chapter = 0, SubChapter = 0;
                    if (VolChapMatch.Groups["Volume"].Success)
                    {
                        UInt32.TryParse(VolChapMatch.Groups["Volume"].Value, out Volume);
                    }
                    if (VolChapMatch.Groups["Chapter"].Success)
                    {
                        UInt32.TryParse(VolChapMatch.Groups["Chapter"].Value, out Chapter);
                    }
                    if (VolChapMatch.Groups["SubChapter"].Success)
                    {
                        UInt32.TryParse(VolChapMatch.Groups["SubChapter"].Value, out SubChapter);
                    }

                    DateTime Released = DateTime.Now;
                    if (ReleaseData.Contains("-"))
                    {
                        // Absolute form: "<dd MMMM yyyy> - <time>". A failed parse must not replace
                        // the default with DateTime.MinValue, hence the temporary.
                        String[] DateParts = ReleaseData.Split(new String[] { " - " }, StringSplitOptions.RemoveEmptyEntries);
                        DateTime ParsedRelease;
                        if (DateParts.Length > 0 &&
                            DateTime.TryParseExact(DateParts[0].Trim(), "dd MMMM yyyy", CultureInfo.InvariantCulture, DateTimeStyles.None, out ParsedRelease))
                        {
                            Released = ParsedRelease;
                        }
                    }
                    else if (ReleaseData.EndsWith("ago", StringComparison.Ordinal))
                    {
                        // Relative form: "<count> <unit> ago"; a non-numeric count ("A", "An") means one.
                        String[] ReleaseDataParts = ReleaseData.Split(new Char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
                        if (ReleaseDataParts.Length >= 2)
                        {
                            Double Offset;
                            if (!Double.TryParse(ReleaseDataParts[0], NumberStyles.Float, CultureInfo.InvariantCulture, out Offset))
                            {
                                Offset = 1;
                            }
                            Offset *= -1;

                            // Dropping the plural 's' lets "hour" and "hours" share a case; the
                            // singular forms used to fall through to seconds.
                            switch (ReleaseDataParts[1].ToLower().TrimEnd('s'))
                            {
                            case "minute":
                                Released = Released.AddMinutes(Offset);
                                break;

                            case "hour":
                                Released = Released.AddHours(Offset);
                                break;

                            case "day":
                                Released = Released.AddDays(Offset);
                                break;

                            case "week":
                                Released = Released.AddDays(7 * Offset);
                                break;

                            default:   // "second(s)" and anything unrecognised
                                Released = Released.AddSeconds(Offset);
                                break;
                            }
                        }
                    }

                    String ChapterUrl  = VolChapNameNode.Attributes["href"].Value;
                    String ChapterHash = ChapterUrl.Split('#').Last().Split('_').First();
                    ChapterUrl = String.Format("https://bato.to/areader?id={0}&p=1&supress_webtoon=t", ChapterHash);
                    ChapterObject chapterObject = new ChapterObject()
                    {
                        Name       = HtmlEntity.DeEntitize(ChapterName),
                        Volume     = Volume,
                        Chapter    = Chapter,
                        SubChapter = SubChapter,
                        Released   = Released,
                        Locations  =
                        {
                            new LocationObject()
                            {
                                ExtensionName     = ExtensionDescriptionAttribute.Name,
                                ExtensionLanguage = ExtensionDescriptionAttribute.Language,
                                Url = ChapterUrl
                            }
                        }
                    };

                    // Same chapter number and a sub chapter within +/-4 counts as the same chapter:
                    // merge into the existing entry instead of adding a duplicate.
                    ChapterObject ExistingChapter = Chapters.Find(o => o.Chapter == chapterObject.Chapter && ((Int32)o.SubChapter - chapterObject.SubChapter).InRange(-4, 4));
                    if (ExistingChapter == null)
                    {
                        Chapters.Add(chapterObject);
                    }
                    else
                    {
                        ExistingChapter.Merge(chapterObject);
                    }
                }
            }
            Chapters.Reverse();

            // -1 marks "no rating"; it is only replaced by a successfully parsed value.
            Double   Rating     = -1;
            HtmlNode RatingNode = MangaObjectDocument.DocumentNode.SelectSingleNode("//div[contains(@class,'rating')]");

            if (RatingNode != null)
            {
                String RatingRaw = RatingNode.InnerText.Trim();
                if (RatingRaw.Length >= 5)   // Substring(1, 4) needs at least five characters.
                {
                    String RatingText = new String(RatingRaw.Substring(1, 4).Where(IsValidRatingChar).ToArray());
                    Double ParsedRating;
                    if (Double.TryParse(RatingText, NumberStyles.Float, CultureInfo.InvariantCulture, out ParsedRating))
                    {
                        Rating = ParsedRating;
                    }
                }
            }

            return(new MangaObject()
            {
                Name = MangaName,
                MangaType = MangaType,
                PageFlowDirection = PageFlowDirection,
                // Already entity-decoded above; decoding twice would corrupt literal entity text.
                Description = Desciption,
                AlternateNames = AlternateNames.ToList(),
                CoverLocations = { new LocationObject()
                                   {
                                       Url = Cover,
                                       ExtensionName = ExtensionDescriptionAttribute.Name,
                                       ExtensionLanguage = ExtensionDescriptionAttribute.Language
                                   } },
                Authors = Authors.ToList(),
                Artists = Artists.ToList(),
                Genres = Genres.ToList(),
                Released = (Chapters.FirstOrDefault() ?? new ChapterObject()).Released,
                Chapters = Chapters,
                Rating = Rating
            });
        }
Exemple #16
0
        /// <summary>
        /// Converts a Shaarli HTML bookmark export into a bookmark tree and serializes it to
        /// <c>&lt;input&gt;.export.json</c>.
        /// </summary>
        /// <remarks>
        /// Every top-level <c>&lt;DT&gt;</c> whose first child carries an <c>HREF</c> becomes a bookmark
        /// in the "unfiled" folder; a <c>&lt;DD&gt;</c> two nodes later becomes its description. When
        /// the link has a <c>TAGS</c> attribute, a copy of the bookmark is added to one folder per tag
        /// under the tags folder. URLs starting with '?' are replaced by <c>about:blank</c>.
        /// Progress and failures are reported on the console; on an <see cref="IOException"/> while
        /// reading the file nothing is written.
        /// </remarks>
        /// <param name="input">Path of the HTML export to read.</param>
        private static void ParseSharliInput(string input)
        {
            // Setting up the base nodes : root, personal bar, tags, unfiltered, base
            Node baseNode = new Node(null,
                                     "",
                                     1,
                                     null,
                                     DateTime.Now.Ticks,
                                     DateTime.Now.Ticks,
                                     null,
                                     Node.TypeMozPlaceContainer,
                                     "placesRoot",
                                     new List <Node>());

            baseNode.Children.Add(new Node(null,
                                           "Menu des marques-pages",
                                           2,
                                           1,
                                           DateTime.Now.Ticks,
                                           DateTime.Now.Ticks,
                                           null,
                                           Node.TypeMozPlaceContainer,
                                           "bookmarksMenuFolder",
                                           null));
            baseNode.Children.Add(new Node(1,
                                           "Barre Personnelle",
                                           3,
                                           1,
                                           DateTime.Now.Ticks,
                                           DateTime.Now.Ticks,
                                           null,
                                           Node.TypeMozPlaceContainer,
                                           "toolbarFolder",
                                           null));
            baseNode.Children.Add(new Node(2,
                                           "Étiquettes",
                                           4,
                                           1,
                                           DateTime.Now.Ticks,
                                           DateTime.Now.Ticks,
                                           null,
                                           Node.TypeMozPlaceContainer,
                                           "tagsFolder",
                                           new List <Node>()));
            baseNode.Children.Add(new Node(3,
                                           "Marque-pages non classés",
                                           5,
                                           1,
                                           DateTime.Now.Ticks,
                                           DateTime.Now.Ticks,
                                           null,
                                           Node.TypeMozPlaceContainer,
                                           "unfiledBookmarksFolder",
                                           new List <Node>()));

            // Getting the HTML in a better state, loading it into a HTMLDocument
            HtmlDocument html     = new HtmlDocument();
            FileInfo     filePath = new FileInfo(input);

            try
            {
                using (System.IO.StreamReader sr = new System.IO.StreamReader(filePath.FullName))
                {
                    string strHtml = sr.ReadToEnd();
                    strHtml = strHtml.Replace("</A>", "</A></DT>");

                    // Regex repair of the markup: a <DT> at the start of a line whose previous line
                    // does not end with '>' follows an unclosed <DD>, so close that <DD> first.
                    var regex = new System.Text.RegularExpressions.Regex(@"(?<!\>)\n<DT>");
                    strHtml = regex.Replace(strHtml, m => "</DD>" + m.Value);

                    // Fixing html entities
                    // NOTE(review): decoding before parsing turns escaped markup characters in titles
                    // and descriptions into real markup - confirm this is acceptable for the exports used.
                    strHtml = HtmlEntity.DeEntitize(strHtml);
                    html.LoadHtml(strHtml);
                }
            }
            catch (IOException)
            {
                Console.WriteLine("Failure : File does not exist : " + input);
                return;
            }


            Console.WriteLine(html.DocumentNode.ChildNodes.Count(node => String.Equals(node.Name, "DT", StringComparison.InvariantCultureIgnoreCase)) + " links found.");


            HtmlNodeCollection nodes      = html.DocumentNode.ChildNodes;
            var                unfiled    = baseNode.Children[3].Children;
            var                tagFolders = baseNode.Children[2].Children;

            for (int i = 0; i < nodes.Count; i++)
            {
                HtmlNode entry = nodes[i];

                if (!String.Equals(entry.Name, "DT", StringComparison.InvariantCultureIgnoreCase))
                {
                    continue;
                }

                // A link is a <DT> whose first child carries an HREF; anything else is not a bookmark.
                if (entry.ChildNodes.Count == 0)
                {
                    continue;
                }
                HtmlNode      anchor = entry.ChildNodes[0];
                HtmlAttribute href   = anchor.Attributes["HREF"];
                if (href == null)
                {
                    continue;
                }

                // The description, if any, is the <DD> two nodes further on (i + 1 is the newline
                // text node). A link at the very end of the file simply has no description.
                Node annotation = null;
                if (i + 2 < nodes.Count &&
                    String.Equals(nodes[i + 2].Name, "dd", StringComparison.InvariantCultureIgnoreCase))
                {
                    annotation = new Node("bookmarkProperties/description",
                                          0,
                                          4,
                                          null,
                                          3,
                                          nodes[i + 2].InnerText);
                }

                // Shaarli URLs (pointing to itself) cause the bookmark import to fail, so they are
                // redirected to about:blank. The same sanitized URL is used for the tag copies below.
                string url = href.Value;
                if (!String.IsNullOrEmpty(url) && url[0] == '?')
                {
                    url = "about:blank";
                }

                unfiled.Add(new Node(null,
                                     anchor.InnerText,
                                     i,
                                     5,
                                     DateTime.Now.Ticks,
                                     DateTime.Now.Ticks,
                                     "text/x-moz-place",
                                     url,
                                     annotation));

                if (!HasATagChild(entry, "a"))
                {
                    continue;
                }

                #region Getting TAGS
                // A "TAGS" attribute on the <A> holds the user's comma separated tags.
                HtmlAttribute tagsAttribute = anchor.Attributes["TAGS"];
                if (tagsAttribute == null || tagsAttribute.Value == null)
                {
                    continue;
                }

                foreach (string tag in tagsAttribute.Value.Split(','))
                {
                    // An empty TAGS value yields one empty entry; it must not become a folder.
                    if (String.IsNullOrWhiteSpace(tag))
                    {
                        continue;
                    }

                    // Reuse the folder of this tag (ignoring case), creating it on first use.
                    Node targetNode = tagFolders.FirstOrDefault(c => String.Equals(c.Title, tag, StringComparison.OrdinalIgnoreCase));
                    if (targetNode == null)
                    {
                        targetNode = new Node(null,
                                              tag,
                                              i,
                                              4,
                                              DateTime.Now.Ticks,
                                              DateTime.Now.Ticks,
                                              null,
                                              Node.TypeMozPlaceContainer,
                                              "",
                                              new List <Node>());
                        tagFolders.Add(targetNode);
                    }

                    targetNode.Children.Add(new Node(null,
                                                     "",
                                                     targetNode.ID + targetNode.Children.Count,
                                                     targetNode.ID,
                                                     DateTime.Now.Ticks,
                                                     DateTime.Now.Ticks,
                                                     "text/x-moz-place",
                                                     url,
                                                     annotation));
                }
                #endregion
            }
            Serialize <Node>(baseNode, input + ".export.json");
            Console.WriteLine("Sucess ! output written to " + input + ".export.json.");
        }
Exemple #17
0
        /// <summary>
        /// Parses a search-result HTML page into a list of search results.
        /// </summary>
        /// <remarks>
        /// For every row with a recognisable id, an additional page
        /// (<c>{RootUrl}/comic_pop?id={Id}</c>) is downloaded to obtain the cover image and description.
        /// </remarks>
        /// <param name="content">Raw HTML of the search result page.</param>
        /// <returns>
        /// One entry per table row that contains a name link; an empty list when no rows match.
        /// </returns>
        public List <SearchResultObject> ParseSearch(string content)
        {
            List <SearchResultObject> SearchResults = new List <SearchResultObject>();
            Regex        IdMatch = new Regex(@"r\d+");
            HtmlDocument SearchResultDocument = new HtmlDocument();

            SearchResultDocument.LoadHtml(content);
            HtmlWeb            HtmlWeb           = new HtmlWeb();
            HtmlNodeCollection HtmlSearchResults = SearchResultDocument.DocumentNode.SelectNodes("//table[contains(@class,'ipb_table chapters_list')]/tbody/tr[not(contains(@class,'header'))]");

            // Nothing matched the row selector: there is nothing to parse.
            if (Equals(HtmlSearchResults, null))
            {
                return(SearchResults);
            }

            foreach (HtmlNode SearchResultNode in HtmlSearchResults)
            {
                HtmlNode NameLink = SearchResultNode.SelectSingleNode(".//td[1]/strong/a");
                if (NameLink == null)
                {
                    continue;
                }

                Int32  Id            = -1;
                String Name          = HtmlEntity.DeEntitize(NameLink.InnerText).Trim(),
                       Link          = NameLink.GetAttributeValue("href", String.Empty),
                       Description   = null;
                LocationObject Cover = null;

                // The id is the digits following 'r' in the link; skip the detail lookup
                // when the link carries no such token instead of failing on Substring(1).
                Match IdResult = IdMatch.Match(Link);
                if (IdResult.Success && Int32.TryParse(IdResult.Value.Substring(1), out Id))
                {
                    HtmlDocument PopDocument     = HtmlWeb.Load(String.Format("{0}/comic_pop?id={1}", ExtensionDescriptionAttribute.RootUrl, Id));
                    HtmlNode     CoverNode       = PopDocument.DocumentNode.SelectSingleNode("//img"),
                                 DescriptionNode = PopDocument.DocumentNode.SelectSingleNode("//table/tbody/tr[6]/td[2]");
                    if (!HtmlNode.Equals(CoverNode, null))
                    {
                        Cover = new LocationObject()
                        {
                            Url               = CoverNode.Attributes["src"].Value,
                            ExtensionName     = ExtensionDescriptionAttribute.Name,
                            ExtensionLanguage = ExtensionDescriptionAttribute.Language
                        };
                    }
                    if (!HtmlNode.Equals(DescriptionNode, null))
                    {
                        Description = DescriptionNode.InnerText.Trim();
                    }
                }

                // The same cell is used for both authors and artists.
                HtmlNode AuthorNode     = SearchResultNode.SelectSingleNode(".//td[2]");
                String[] Author_Artists = AuthorNode == null
                                          ? new String[0]
                                          : new[] { AuthorNode.InnerText.Trim() };

                // The rating is the first four characters of the title attribute. It is scraped,
                // machine-formatted text, so it is parsed with the invariant culture and falls
                // back to 0 when missing or malformed.
                Double   Rating      = 0;
                HtmlNode RatingNode  = SearchResultNode.SelectSingleNode(".//td[3]/div");
                String   RatingTitle = RatingNode == null
                                       ? String.Empty
                                       : RatingNode.GetAttributeValue("title", String.Empty);
                if (RatingTitle.Length >= 4)
                {
                    Double.TryParse(RatingTitle.Substring(0, 4),
                                    System.Globalization.NumberStyles.Float,
                                    System.Globalization.CultureInfo.InvariantCulture,
                                    out Rating);
                }

                SearchResults.Add(new SearchResultObject()
                {
                    Cover             = Cover,
                    Description       = Description,
                    ExtensionName     = ExtensionDescriptionAttribute.Name,
                    ExtensionLanguage = ExtensionDescriptionAttribute.Language,
                    Name    = Name,
                    Url     = Link,
                    Id      = Id.ToString(),
                    Rating  = Rating,
                    Artists = Author_Artists.ToList(),
                    Authors = Author_Artists.ToList()
                });
            }
            return(SearchResults);
        }
    }
        /// <summary>
        /// Reduces an HTML document: renames, removes and cleans nodes and attributes as configured in
        /// <c>settings</c>, then optionally compacts whitespace, re-renders the markup and prepends a
        /// signature comment.
        /// </summary>
        /// <param name="htmlInput">The HTML input.</param>
        /// <returns>The reduced HTML markup.</returns>
        public String ReduceDocument(String htmlInput)
        {
            HtmlDocument htmlDocument = new HtmlDocument();

            htmlDocument.OptionFixNestedTags  = true;
            htmlDocument.OptionAutoCloseOnEnd = true;

            htmlDocument.LoadHtml(htmlInput);

            List <HtmlNode> htmlNodes = htmlDocument.DocumentNode.ChildNodes.ToList();

            var tagNameReplacements  = settings.tagNameReplacement.GetDictionary();
            var attWithValueToRemove = settings.attributeWithValueToRemove.GetDictionary();

            // First phase: walk the tree level by level, applying the node/attribute rules.
            while (htmlNodes.Any())
            {
                List <HtmlNode> nextIteration = new List <HtmlNode>();

                foreach (HtmlNode node in htmlNodes)
                {
                    String nodeName = node.Name.ToLowerInvariant();

                    if (tagNameReplacements.ContainsKey(nodeName))
                    {
                        // Subsequent rules are evaluated against the replacement name.
                        nodeName  = tagNameReplacements[nodeName].value;
                        node.Name = nodeName;
                    }

                    if (settings.tagsToRemove.Contains(nodeName))
                    {
                        node.Remove();
                        continue;
                    }

                    Boolean addToNextIteration = true;

                    if (settings.emptyTagsToRemove.Contains(nodeName) && IsNodeEmpty(node))
                    {
                        node.Remove();
                        addToNextIteration = false;
                    }

                    if (settings.tagsToRemoveAllAttributes.Contains(nodeName))
                    {
                        node.Attributes.RemoveAll();
                    }

                    if (!addToNextIteration)
                    {
                        continue;
                    }

                    nextIteration.Add(node);

                    // Iterate over a snapshot because attributes are removed while looping.
                    foreach (var attribute in node.Attributes.ToList())
                    {
                        String attributeName = attribute.Name.ToLowerInvariant();

                        if (settings.attributesToRemove.Contains(attributeName))
                        {
                            attribute.Remove();
                            // Already detached: do not let the value rule below remove it a second time.
                            continue;
                        }

                        if (!attribute.Value.isNullOrEmpty() &&
                            attWithValueToRemove.ContainsKey(attributeName) &&
                            attribute.Value.toStringSafe() == attWithValueToRemove[attributeName].value)
                        {
                            attribute.Remove();
                        }
                    }
                }

                // Descend one level: children of every node that survived this pass.
                htmlNodes = new List <HtmlNode>();
                foreach (HtmlNode node in nextIteration)
                {
                    htmlNodes.AddRange(node.ChildNodes.ToList());
                }
            }

            String outputHtml = htmlDocument.DocumentNode.OuterHtml;

            if (settings.ReduceEmptySpace)
            {
                outputHtml = HtmlEntity.DeEntitize(outputHtml);

                outputHtml = REGEX_SELECTCOMMENTS.Replace(outputHtml, "");

                outputHtml = REGEX_EMPTYSPACE.Replace(outputHtml, ">" + Environment.NewLine + "<");

                outputHtml = outputHtml.Replace("><", ">" + Environment.NewLine + "<");

                String doubleNewLine = Environment.NewLine + Environment.NewLine;

                // Collapse runs of blank lines. Ordinal search so newline sequences are matched
                // byte-for-byte (String.Replace is ordinal as well), and ">= 0" so that a run at
                // the very start of the text is collapsed too. The pass count is capped at 100.
                Int32 i = 0;
                while (outputHtml.IndexOf(doubleNewLine, StringComparison.Ordinal) >= 0)
                {
                    outputHtml = outputHtml.Replace(doubleNewLine, Environment.NewLine);
                    i++;

                    if (i > 100)
                    {
                        break;
                    }
                }
            }

            if (settings.RebuildHtml)
            {
                // Re-parse the reduced markup and render it again through RenderNode.
                HtmlDocument document = new HtmlDocument();
                document.LoadHtml(outputHtml);

                StringBuilder sb = new StringBuilder();

                RenderNode(document.DocumentNode, sb, 0);

                outputHtml = sb.ToString();
            }

            if (settings.InsertReductionSignature)
            {
                String headerComment = RenderCommentNode("imbSCI.DataExtraction - Reduced HTML document");
                outputHtml = headerComment + Environment.NewLine + outputHtml;
            }

            return(outputHtml);
        }
        /// <summary>
        /// Verifies that <c>HtmlEntity.DeEntitize</c> turns the <c>&amp;apos;</c> entity into an apostrophe.
        /// </summary>
        public void DeEntitize()
        {
            var html = @"mouse&apos;s house";

            // Feed the declared input to the call instead of repeating the literal.
            Assert.AreEqual("mouse's house", HtmlEntity.DeEntitize(html));
        }
Exemple #20
0
        /// <summary>
        /// Converts the raw, HTML-fragment based grant item into a structured <c>GrantItemInfo</c>.
        /// </summary>
        /// <remarks>
        /// The title is split into type and title, the detail list into name/value pairs, the
        /// description into a leading label and text, and the links into typed link objects.
        /// </remarks>
        /// <param name="info">Raw item whose Title, Details, Description, Links and Image are parsed.</param>
        /// <returns>The populated grant item.</returns>
        /// <exception cref="InvalidOperationException">
        /// The first detail line has no "name:" part, so there is no previous entry to append it to.
        /// </exception>
        /// <exception cref="FormatException">A detail line splits into more than two segments.</exception>
        /// <exception cref="NotSupportedException">A link href matches none of the known patterns.</exception>
        public static GrantItemInfo ToGrantInfo(this RawGrantItemInfo info)
        {
            var regTitle   = new Regex(@"\[(?<type>.*)\]&nbsp;(?<title>.*)");
            var titleMatch = regTitle.Match(info.Title);
            var title      = titleMatch.Groups["title"].Value;
            var type       = titleMatch.Groups["type"].Value;

            var docDetails = new HtmlDocument();

            docDetails.LoadHtml(info.Details);
            var lis     = docDetails.DocumentNode.CssSelect("li");
            var details = new List <GrantDetailInfo>();

            foreach (var li in lis)
            {
                if (string.IsNullOrWhiteSpace(li.InnerText))
                {
                    continue;
                }

                // Keep only direct text children and <div>s that wrap a single text node;
                // every other child contributes nothing to the line.
                var text = string.Join("", li.ChildNodes.Select(child =>
                {
                    if (child.NodeType == HtmlNodeType.Text)
                    {
                        return(child.InnerText);
                    }
                    else if (child.NodeType == HtmlNodeType.Element && child.Name == "div" &&
                             child.ChildNodes.Count == 1 && child.ChildNodes.First().NodeType == HtmlNodeType.Text)
                    {
                        return(child.ChildNodes.First().InnerText);
                    }
                    else
                    {
                        return(null);
                    }
                }));
                text = HtmlEntity.DeEntitize(text);
                var segs = text.SplitTwo(":");
                if (segs.Length == 1)
                {
                    // A line without a name continues the previous entry: merge its value into
                    // that entry and re-add the combined entry below.
                    if (details.Count == 0)
                    {
                        throw new InvalidOperationException("The first detail line has no name to attach its value to: '" + text + "'.");
                    }
                    var last = details[details.Count - 1];
                    segs = new[] { last.Name, string.Join(";", last.Values.Concat(new[] { segs[0] })) };
                    details.RemoveAt(details.Count - 1);
                }
                else if (segs.Length > 2)
                {
                    throw new FormatException("Detail line has more than one name/value separator: '" + text + "'.");
                }

                var vals = segs[1]
                           .Split(new[] { ";" }, StringSplitOptions.RemoveEmptyEntries)
                           .Select(v => v.Trim())
                           .ToArray();
                details.Add(new GrantDetailInfo {
                    Name = segs[0].Trim(), Values = vals
                });
            }

            // parse description
            var docDesc = new HtmlDocument();

            docDesc.LoadHtml(info.Description);
            var desc        = docDesc.DocumentNode.InnerText;
            var leadingDesc = "";

            // Drop the trailing two-character "全部" suffix when present.
            if (desc.EndsWith("全部", StringComparison.Ordinal))
            {
                desc = desc.Substring(0, desc.Length - 2).Trim();
            }

            desc = HtmlEntity.DeEntitize(desc);
            var d = desc.SplitTwo(":");

            if (d.Length > 1)
            {
                leadingDesc = d[0].Trim();
                desc        = d[1].Trim();
            }
            else
            {
                // No "label:" prefix: keep the whole text as the value under an empty label
                // rather than failing on d[1].
                desc = desc.Trim();
            }

            details.Add(new GrantDetailInfo {
                Name = leadingDesc, Values = new[] { desc }
            });

            // parse links
            var rePam    = new Regex(@"javascript\:pam3\('(?<type>[piudg]{3})','(?<id>.+)','(?<index>\d?)'\);");
            var reTx     = new Regex(@"javascript\:sw_xx\('(?<number>.*)'\);");
            var docLinks = new HtmlDocument();

            docLinks.LoadHtml(info.Links);
            var links = docLinks.DocumentNode.CssSelect("span a")
                        .Select(link => new { href = link.GetAttributeValue("href"), text = link.InnerText })
                        .Select(link =>
            {
                var pamMatch = rePam.Match(link.href);
                if (pamMatch.Success)
                {
                    var pamType  = pamMatch.Groups["type"].Value;
                    var pamId    = pamMatch.Groups["id"].Value;
                    var pamIndex = pamMatch.Groups["index"].Value;
                    return((GrantItemLinkBase) new GrantItemPamLink
                    {
                        Title = link.text,
                        Type = pamType,
                        Id = pamId,
                        Index = pamIndex,
                    });
                }

                var txMatch = reTx.Match(link.href);
                if (txMatch.Success)
                {
                    var txNumber = txMatch.Groups["number"].Value;
                    return(new GrantItemTxLink
                    {
                        Title = link.text,
                        Number = txNumber,
                    });
                }

                throw new NotSupportedException("cannot parse link");
            })
                        .ToArray();

            // The full-size image URL is the thumbnail URL without the "_thumb" suffix.
            // The dot is escaped so that only a literal ".jpg" extension matches.
            var imageUrl = Regex.Replace(info.Image, @"_thumb\.jpg$", ".jpg");

            return(new GrantItemInfo
            {
                Id = info.Id,
                Details = details.ToArray(),
                ThumbImage = info.Image,
                Image = imageUrl,
                Links = links,
                QrImage = info.QrImage,
                Title = title,
                Type = type,
            });
        }
        /// <summary>
        /// Parses a seasonal anime listing page into an <c>AnimeListData</c>, including the
        /// previous/next page links when the page has a pagination block.
        /// </summary>
        /// <remarks>
        /// Parsing is best-effort: any exception is written to the console and the entries
        /// collected up to that point are returned.
        /// </remarks>
        /// <param name="HTMLCode">Raw HTML of the listing page.</param>
        /// <param name="link">URL of the page; used to build paging links and in the error output.</param>
        /// <returns>The parsed list (possibly partial or empty).</returns>
        private static AnimeListData AnalyzeDocument(string HTMLCode, string link)
        {
            // True when the node's class attribute is exactly the given value.
            bool HasClass(HtmlNode n, string cls) => n.GetAttributeValue("class", "") == cls;

            HtmlDocument doc = new HtmlDocument();

            doc.LoadHtml(HTMLCode);

            AnimeListData list = new AnimeListData();

            try
            {
                foreach (var animeNode in doc.DocumentNode.Descendants("div").Where(x => HasClass(x, "seasonal-anime js-seasonal-anime")))
                {
                    HtmlNode titleNode = animeNode.Descendants("p").First(x => HasClass(x, "title-text"));

                    CoreAnimeEntry page = new CoreAnimeEntry()
                    {
                        Title     = Utility.FixString(HtmlEntity.DeEntitize(titleNode.InnerText)),
                        AnimeLink = new LinkInfo(titleNode.Descendants("a").First().GetAttributeValue("href", "")),
                        Episodes  = Utility.FixString(HtmlEntity.DeEntitize(animeNode.Descendants("div").First(x => HasClass(x, "eps")).InnerText)),
                        ImageLink = new LinkInfo(animeNode.Descendants("div").First(x => HasClass(x, "image")).Descendants().First().GetAttributeValue("src", "")),
                        Synopsis  = HtmlEntity.DeEntitize(animeNode.Descendants("div").First(x => HasClass(x, "synopsis js-synopsis")).InnerText),
                    };

                    string sourceText = animeNode.Descendants("span").First(x => HasClass(x, "source")).InnerText;
                    if (sourceText.Length > 1)
                    {
                        page.Source = (AnimeSourceType)Enum.Parse(typeof(AnimeSourceType), Utility.FixEnum(sourceText));
                    }

                    HtmlNode producerNode = animeNode.Descendants("span").First(x => HasClass(x, "producer"));
                    LinkInfo prod         = new LinkInfo()
                    {
                        Name = Utility.FixString(HtmlEntity.DeEntitize(producerNode.InnerText))
                    };

                    HtmlNode producerLink = producerNode.Descendants("a").FirstOrDefault();
                    if (producerLink != null)
                    {
                        prod.Path = Utility.GetCorrectLinkFormat(producerLink.GetAttributeValue("href", ""));
                    }
                    page.Studios.Add(prod);

                    // Genre spans are identified by their class attribute, like every other
                    // element on this page (the previous filter read a non-existent "span"
                    // attribute and therefore never matched).
                    foreach (var genreNode in animeNode.Descendants("span").Where(x => HasClass(x, "genre")))
                    {
                        page.Genres.Add(Utility.FixString(genreNode.InnerText));
                    }

                    // NOTE(review): the parsed score is never stored anywhere in this method —
                    // confirm whether it should be assigned to the entry.
                    float.TryParse(animeNode.Descendants("span").First(x => HasClass(x, "score")).InnerText, out float score);
                    page.Members = Utility.GetIntFromString(animeNode.Descendants("span").First(x => HasClass(x, "member fl-r")).InnerText);

                    // Info text is "<type> - <aired>[, ...][, <broadcast>]".
                    string[] str        = HtmlEntity.DeEntitize(animeNode.Descendants("div").First(x => HasClass(x, "info")).InnerText).Split('-');
                    string[] airedParts = str[1].Split(',');
                    page.Type  = Utility.FixString(str[0].Replace(" ", ""));
                    page.Aired = Utility.FixString(airedParts.Take(2).Aggregate((x, y) => x + ", " + y));
                    if (airedParts.Length > 2)
                    {
                        page.Broadcast = Utility.FixString(airedParts[2]);
                    }

                    list.Animes.Add(page);
                }

                if (doc.DocumentNode.Descendants().Any(x => HasClass(x, "mt12 mb12")))
                {
                    // The page number is whatever follows the last '=' in each paging link.
                    int currOffset = Utility.GetIntFromString(doc.DocumentNode.Descendants().First(x => HasClass(x, "link current")).GetAttributeValue("href", "").Split('=').Last());

                    var links     = doc.DocumentNode.Descendants().Where(x => HasClass(x, "link"));
                    int minOffset = Utility.GetIntFromString(links.First().GetAttributeValue("href", "").Split('=').Last());
                    int maxOffset = Utility.GetIntFromString(links.Last().GetAttributeValue("href", "").Split('=').Last());

                    string newLink = "";
                    if (link.Contains("?page="))
                    {
                        newLink = link.Split('=')[0] + "=";
                    }
                    else
                    {
                        newLink = link + "?page=";
                    }

                    if (currOffset > minOffset)
                    {
                        list.PreviousPageLink = new LinkInfo(newLink + (currOffset - 1));
                    }
                    if (currOffset < maxOffset)
                    {
                        list.NextPageLink = new LinkInfo(newLink + (currOffset + 1));
                    }
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message + ex.StackTrace + link);
            }

            return(list);
        }
 /// <summary>
 /// Replaces the HTML character entities in a string with the characters they stand for.
 /// </summary>
 /// <param name="value">Text that may contain HTML entities.</param>
 /// <returns>The result of <c>HtmlEntity.DeEntitize</c> applied to <paramref name="value"/>.</returns>
 public static String HtmlDecode(String value) => HtmlEntity.DeEntitize(value);
 /// <summary>
 /// Returns the node's inner text with HTML entities decoded.
 /// </summary>
 /// <param name="htmlNode">Node whose <c>InnerText</c> is read.</param>
 /// <returns>The result of <c>HtmlEntity.DeEntitize</c> applied to the node's inner text.</returns>
 public static string GetDecodedInnerText(this HtmlNode htmlNode)
 {
     string rawText = htmlNode.InnerText;

     return(HtmlEntity.DeEntitize(rawText));
 }
Exemple #24
0
        /// <summary>
        /// Writes a plain-text rendering of <paramref name="node"/> to <paramref name="outText"/>.
        /// </summary>
        /// <remarks>
        /// Comments and the text of script/style elements are skipped, text is entity-decoded,
        /// p and br emit a line break, and an anchor emits its target as <c>&lt;href&gt;</c>
        /// before its content.
        /// </remarks>
        /// <param name="node">Node to convert.</param>
        /// <param name="outText">Writer receiving the text.</param>
        private static void ConvertTo(HtmlNode node, TextWriter outText)
        {
            string html;

            switch (node.NodeType)
            {
            case HtmlNodeType.Comment:
                // don't output comments
                break;

            case HtmlNodeType.Document:
                ConvertContentTo(node, outText);
                break;

            case HtmlNodeType.Text:
                // script and style must not be output
                string parentName = node.ParentNode.Name;
                if ((parentName == "script") || (parentName == "style"))
                {
                    break;
                }

                // get text
                html = ((HtmlTextNode)node).Text;

                // is it in fact a special closing node output as text?
                if (HtmlNode.IsOverlappedClosingElement(html))
                {
                    break;
                }

                // check the text is meaningful and not a bunch of white spaces
                if (html.Trim().Length > 0)
                {
                    outText.Write(HtmlEntity.DeEntitize(html));
                }
                break;

            case HtmlNodeType.Element:
                switch (node.Name)
                {
                case "p":
                    // treat paragraphs as crlf
                    outText.Write(Environment.NewLine);
                    break;

                case "br":
                    outText.Write(Environment.NewLine);
                    break;

                case "a":
                    // Anchors without an href (e.g. named anchors) have no target to print;
                    // skip them instead of dereferencing a missing attribute.
                    HtmlAttribute att = node.Attributes["href"];
                    if (att != null)
                    {
                        outText.Write($"<{att.Value}>");
                    }
                    break;
                }

                if (node.HasChildNodes)
                {
                    ConvertContentTo(node, outText);
                }
                break;
            }
        }
    /// <summary>
    /// Writes a plain-text rendering of <paramref name="node"/> to <paramref name="outText"/>,
    /// using <paramref name="textInfo"/> to track whitespace state between consecutive text nodes.
    /// </summary>
    /// <remarks>
    /// Block elements (p, div, ul) are surrounded by line breaks, list items become
    /// "*\t" bullets, anchors append their target as <c>&lt;href&gt;</c> unless the link text already
    /// contains it, and images are written as <c>[alt&lt;src&gt;]</c>.
    /// </remarks>
    /// <param name="node">Node to convert.</param>
    /// <param name="outText">Writer receiving the text.</param>
    /// <param name="textInfo">Whitespace state for the current block; updated as text is written.</param>
    internal static void ConvertTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
    {
        string html;

        switch (node.NodeType)
        {
        case HtmlNodeType.Comment:
            // don't output comments
            break;

        case HtmlNodeType.Document:
            ConvertContentTo(node, outText, textInfo);
            break;

        case HtmlNodeType.Text:
            // script and style must not be output
            string parentName = node.ParentNode.Name;
            if ((parentName == "script") || (parentName == "style"))
            {
                break;
            }
            // get text
            html = ((HtmlTextNode)node).Text;
            // is it in fact a special closing node output as text?
            if (HtmlNode.IsOverlappedClosingElement(html))
            {
                break;
            }
            // nothing to write for an empty text node
            if (html.Length == 0)
            {
                break;
            }
            // At the start of a block, or right after a space, leading whitespace is dropped.
            if (!textInfo.FirstTextOfBlockWritten || textInfo.LastCharWasSpace)
            {
                html = html.TrimStart();
                if (html.Length == 0)
                {
                    break;
                }
                textInfo.FirstTextOfBlockWritten = true;
            }
            outText.Write(HtmlEntity.DeEntitize(Regex.Replace(html.TrimEnd(), @"\s{2,}", " ")));
            // Trailing whitespace was trimmed above; re-emit it as one space and remember it
            // (the assignment inside the condition is intentional).
            if (textInfo.LastCharWasSpace = char.IsWhiteSpace(html[html.Length - 1]))
            {
                outText.Write(' ');
            }
            break;

        case HtmlNodeType.Element:
            string endElementString = null;
            bool   isInline;
            switch (node.Name)
            {
            case "p":
            case "div":             // stylistic - adjust as you tend to use
                if (textInfo.IsFirstElementOfDoc)
                {
                    textInfo.IsFirstElementOfDoc = false;
                }
                else
                {
                    outText.Write("\r\n");
                }
                endElementString = "\r\n";
                isInline         = false;
                break;

            case "a":
                if (node.Attributes.Contains("href"))
                {
                    string href = node.Attributes["href"].Value;
                    // Only append the target when the link text does not already show it.
                    if (node.InnerText.IndexOf(href, StringComparison.InvariantCultureIgnoreCase) == -1)
                    {
                        endElementString = "<" + href + ">";
                    }
                }
                isInline = true;
                break;

            case "li":                    //not doing ol li elements at this stage
                outText.Write("\r\n*\t"); //using '*' as bullet char, with tab after, but whatever you want eg "\t->", if utf-8 0x2022
                isInline = false;
                break;

            case "ul":
                endElementString = "\r\n";
                isInline         = false;
                break;

            case "img":             //inline-block in reality, but KISS
                // Everything is written here, including the closing bracket: the end string
                // below is only emitted for nodes that have children.
                bool hasAlt = node.Attributes.Contains("alt");
                if (hasAlt)
                {
                    outText.Write('[' + node.Attributes["alt"].Value);
                }
                if (node.Attributes.Contains("src"))
                {
                    outText.Write('<' + node.Attributes["src"].Value + '>');
                }
                if (hasAlt)
                {
                    outText.Write(']');
                }
                isInline = true;
                break;

            case "span":
            case "strong":
            case "em":
                isInline = true;
                break;

            default:
                isInline = false;
                break;
            }
            if (node.HasChildNodes)
            {
                // Inline elements continue the current whitespace state; anything else starts fresh.
                ConvertContentTo(node, outText, isInline ? textInfo : new PreceedingDomTextInfo());
                if (endElementString != null)
                {
                    outText.Write(endElementString);
                }
            }
            break;
        }
    }
Exemple #26
0
        /// <summary>
        /// Writes a plain-text rendering of <paramref name="node"/> to <paramref name="outText"/>.
        /// </summary>
        /// <remarks>
        /// br and hr write a line (hr as 32 underscores) and return immediately; images are written
        /// as <c>[alt]</c>; list items are prefixed with "- "; p, div, tr and li end with a line break
        /// and anchors with <c>&lt;href&gt;</c>, but only when the element produced output.
        /// </remarks>
        /// <param name="node">Node to convert.</param>
        /// <param name="outText">Writer receiving the text.</param>
        /// <returns>
        /// <c>true</c> when text content was written for this node; <c>false</c> otherwise
        /// (br and hr always report <c>false</c>).
        /// </returns>
        private static bool ConvertTo(HtmlNode node, TextWriter outText)
        {
            var result = false;

            switch (node.NodeType)
            {
            case HtmlNodeType.Document:
                result = ConvertContentTo(node, outText);
                break;

            case HtmlNodeType.Text:
                // script and style must not be output
                var parentName = node.ParentNode.Name;
                if (parentName == "script" || parentName == "style")
                {
                    break;
                }

                // get text
                var html = ((HtmlTextNode)node).Text;
                // is it in fact a special closing node output as text?
                if (HtmlNode.IsOverlappedClosingElement(html))
                {
                    break;
                }

                // Line breaks inside a text node become spaces before decoding.
                var text = HtmlEntity.DeEntitize(html.Replace("\r\n", " ").Replace("\n", " ").Trim());
                if (string.IsNullOrEmpty(text))
                {
                    break;
                }

                result = true;
                outText.Write(text);
                break;

            case HtmlNodeType.Element:
                switch (node.Name)
                {
                case "br":
                    outText.WriteLine();
                    return(false);

                case "hr":
                    outText.WriteLine(new string('_', 32));
                    return(false);

                case "img":
                    // Default to an empty string so that an image without an alt attribute
                    // is skipped instead of failing on Trim().
                    var alt = node.GetAttributeValue("alt", string.Empty).Trim();
                    if (!string.IsNullOrEmpty(alt))
                    {
                        result = true;
                        outText.Write($"[{alt}]");
                    }
                    break;

                case "li":
                    outText.Write("- ");
                    break;
                }

                if (node.HasChildNodes)
                {
                    result = ConvertContentTo(node, outText);
                }

                // Trailing decoration is only added when the element actually produced output.
                if (result)
                {
                    switch (node.Name)
                    {
                    case "p":
                    case "div":
                    case "tr":
                    case "li":
                        outText.WriteLine();
                        break;

                    case "a":
                        var href = node.GetAttributeValue("href", null);
                        if (!string.IsNullOrEmpty(href))
                        {
                            outText.Write($"<{href}>");
                        }
                        break;
                    }
                }
                break;
            }

            return(result);
        }
        /// <summary>
        /// Writes a Markdown-flavoured text rendering of <paramref name="node"/> to
        /// <paramref name="outText"/>, dispatching on the node type.
        /// </summary>
        /// <remarks>
        /// Comments are dropped. Text nodes are skipped under <c>script</c>/<c>style</c>, have runs of
        /// whitespace collapsed, and are entity-decoded. Elements write an opening marker, hand their
        /// children to <c>ConvertContentTo</c>, then write a closing marker.
        /// Opening markers for <c>em</c>, <c>strong</c>, <c>code</c> and headings are written
        /// unconditionally so they always pair with their closing marker; only the leading line break
        /// of block-level elements is suppressed while no text has been written yet.
        /// </remarks>
        /// <param name="node">Node to render.</param>
        /// <param name="outText">Writer that receives the rendered text.</param>
        /// <param name="textInfo">
        /// Mutable whitespace and list-numbering state; this method updates it as text is written.
        /// </param>
        private static void ConvertTo(HtmlNode node, TextWriter outText, PrecedingDomTextInfo textInfo)
        {
            switch (node.NodeType)
            {
            case HtmlNodeType.Comment:
                // don't output comments
                break;

            case HtmlNodeType.Document:
                ConvertContentTo(node, outText, textInfo);
                break;

            case HtmlNodeType.Text:
                // script and style must not be output
                string parentName = node.ParentNode.Name;
                if ((parentName == "script") || (parentName == "style"))
                {
                    break;
                }

                // get text
                string html = ((HtmlTextNode)node).Text;

                // is it in fact a special closing node output as text?
                if (HtmlNode.IsOverlappedClosingElement(html))
                {
                    break;
                }

                if (html.Length == 0)
                {
                    break;
                }

                // Leading whitespace is dropped at the start of a block or right after a space
                // that has already been written, so spaces never double up.
                if (!textInfo.WritePrecedingWhiteSpace || textInfo.LastCharWasSpace)
                {
                    html = html.TrimStart();
                    if (html.Length == 0)
                    {
                        break;
                    }

                    textInfo.IsFirstTextOfDocWritten.Value = textInfo.WritePrecedingWhiteSpace = true;
                }

                outText.Write(HtmlEntity.DeEntitize(Regex.Replace(html.TrimEnd(), @"\s{2,}", " ")));

                // Trailing whitespace was trimmed above; re-emit it as exactly one space and remember it.
                textInfo.LastCharWasSpace = char.IsWhiteSpace(html[html.Length - 1]);
                if (textInfo.LastCharWasSpace)
                {
                    outText.Write(' ');
                }

                break;

            case HtmlNodeType.Element:
                string? endElementString = null;
                bool    isInline;
                bool    skip      = false;
                int     listIndex = 0;
                switch (node.Name)
                {
                case "nav":
                    // navigation blocks are dropped entirely
                    skip     = true;
                    isInline = false;
                    break;

                case "body":
                case "section":
                case "article":
                case "aside":
                case "header":
                case "footer":
                case "address":
                case "main":
                case "div":
                case "p":         // stylistic - adjust as you tend to use
                    // no blank line ahead of the very first text of the document
                    if (textInfo.IsFirstTextOfDocWritten)
                    {
                        outText.Write("\r\n");
                    }

                    endElementString = "\r\n";
                    isInline         = false;
                    break;

                case "em":
                    // always open the marker: the closing "_" below is unconditional
                    outText.Write("_");
                    endElementString = "_";
                    isInline         = true;
                    break;

                case "strong":
                    outText.Write("__");
                    endElementString = "__";
                    isInline         = true;
                    break;

                case "h1":
                case "h2":
                case "h3":
                case "h4":
                case "h5":
                case "h6":
                    // only the separating line break depends on earlier output;
                    // the "#" prefix must be written even for a heading that opens the document
                    if (textInfo.IsFirstTextOfDocWritten)
                    {
                        outText.Write("\r\n");
                    }

                    // the case labels guarantee the name is 'h' followed by a single digit 1-6
                    outText.Write(new string('#', node.Name[1] - '0'));
                    outText.Write(" ");

                    endElementString = "\r\n";
                    isInline         = false;
                    break;

                case "br":
                    outText.Write("\r\n");
                    skip = true;
                    textInfo.WritePrecedingWhiteSpace = false;
                    isInline = true;
                    break;

                case "a":
                    if (node.Attributes.Contains("href"))
                    {
                        string href = node.Attributes["href"].Value?.Trim() ?? "";

                        // URI schemes are case-insensitive, so "DATA:" is skipped as well
                        if (!string.IsNullOrEmpty(href) && !href.StartsWith("data:", StringComparison.OrdinalIgnoreCase))
                        {
                            outText.Write("[");
                            endElementString = $"]({href})";
                        }
                    }

                    isInline = true;
                    break;

                case "code":
                    outText.Write("`");
                    endElementString = "`";
                    isInline         = true;
                    break;

                case "li":
                    // a positive ListIndex means a numbered item; otherwise a bullet is written
                    if (textInfo.ListIndex > 0)
                    {
                        outText.Write("\r\n{0}.\t", textInfo.ListIndex++);
                    }
                    else
                    {
                        //using '*' as bullet char, with tab after, but whatever you want eg "\t->", if utf-8 0x2022
                        outText.Write("\r\n*\t");
                    }

                    isInline = false;
                    break;

                case "ol":
                    listIndex = 1;
                    goto case "ul";

                case "ul":
                    //not handling nested lists any differently at this stage - that is getting close to rendering problems
                    endElementString = "\r\n";
                    isInline         = false;
                    break;

                case "img":         //inline-block in reality
                    if (node.Attributes.Contains("alt"))
                    {
                        outText.Write('[' + node.Attributes["alt"].Value);
                        endElementString = "]";
                    }

                    if (node.Attributes.Contains("src"))
                    {
                        outText.Write('<' + node.Attributes["src"].Value + '>');
                    }

                    isInline = true;
                    break;

                default:
                    isInline = true;
                    break;
                }

                if (!skip && node.HasChildNodes)
                {
                    // inline elements keep the current text state; other elements get a fresh
                    // state object carrying the list index chosen above
                    PrecedingDomTextInfo childInfo = isInline
                        ? textInfo
                        : new PrecedingDomTextInfo(textInfo.IsFirstTextOfDocWritten)
                    {
                        ListIndex = listIndex
                    };

                    ConvertContentTo(node, outText, childInfo);
                }

                if (endElementString != null)
                {
                    outText.Write(endElementString);
                }

                break;
            }
        }
Exemple #28
0
        /// <summary>
        /// Writes a marker-annotated text form of <paramref name="node"/> to <paramref name="outText"/>.
        /// Returns immediately once <c>hasContentEnd</c> is set.
        /// </summary>
        /// <remarks>
        /// Text under <c>script</c>/<c>style</c> and whitespace-only text are not written. Elements emit a
        /// leading marker, then their children via <c>ConvertContentTo</c>, then (for headings h2-h6,
        /// lists and <c>pre</c>) a trailing marker. Seeing an <c>h1</c> sets <c>hasH1</c>; after that, a
        /// <c>div</c> whose class attribute contains any entry of <c>DIV_CLASS_END</c> sets <c>hasContentEnd</c>.
        /// </remarks>
        /// <param name="node">Node to render.</param>
        /// <param name="outText">Writer that receives the output.</param>
        private void ConvertToText(HtmlNode node, TextWriter outText)
        {
            if (hasContentEnd)
            {
                return;
            }

            HtmlNodeType kind = node.NodeType;

            if (kind == HtmlNodeType.Document)
            {
                ConvertContentTo(node, outText);
                return;
            }

            if (kind == HtmlNodeType.Text)
            {
                // script and style bodies are never part of the text
                string ownerTag = node.ParentNode.Name;
                if (ownerTag == "script" || ownerTag == "style")
                {
                    return;
                }

                string raw = ((HtmlTextNode)node).Text;

                // a stray closing tag that the parser surfaced as text
                if (HtmlNode.IsOverlappedClosingElement(raw))
                {
                    return;
                }

                // whitespace-only runs carry no content
                if (raw.Trim().Length > 0)
                {
                    outText.Write(HtmlEntity.DeEntitize(raw));
                }

                return;
            }

            if (kind != HtmlNodeType.Element)
            {
                // comments (and anything else) produce no output
                return;
            }

            // Marker written after the children; empty when the element has none.
            string closingMarker = string.Empty;

            switch (node.Name)
            {
            case "pre":
                outText.Write("\r\n^\r\n");
                closingMarker = "\r\nⱽ\r\n";
                break;

            case "ol":
            case "ul":
                outText.Write("\r\n⌐\r\n");
                closingMarker = "\r\n┘\r\n";
                break;

            case "li":
                outText.Write("\r\n● ");
                break;

            case "div":
                outText.Write("\r\n");
                if (hasH1 && !hasContentEnd)
                {
                    var classValue = node.getAttribute("class");
                    if (classValue != null && classValue.Length > 0 &&
                        DIV_CLASS_END.Any(marker => classValue.IndexOf(marker) != -1))
                    {
                        hasContentEnd = true;
                    }
                }
                break;

            case "p":
                outText.Write("\r\n");
                break;

            case "h2":
            case "h3":
            case "h4":
            case "h5":
            case "h6":
                outText.Write("\r\n■ ");
                closingMarker = "\r\n";
                break;

            case "h1":
                hasH1 = true;
                outText.Write("\r\n{H1}\r\n");
                break;

            case "img":
                var imageSource = node.getAttribute("src");
                if (!string.IsNullOrEmpty(imageSource))
                {
                    outText.Write("\r\n{IMG-" + imageSource + "-IMG}\r\n");
                }
                break;
            }

            if (node.HasChildNodes)
            {
                ConvertContentTo(node, outText);
            }

            if (closingMarker.Length > 0)
            {
                outText.Write(closingMarker);
            }
        }
Exemple #29
0
        /// <summary>
        /// Appends the plain-text content of <paramref name="nodes"/> to <paramref name="builder"/>,
        /// descending into child nodes recursively.
        /// </summary>
        /// <remarks>
        /// A run of ordinary whitespace is emitted as one space, and only once a visible character follows it;
        /// whitespace at the start of a line is dropped. U+00A0, U+2007 and U+202F are each emitted as a
        /// space and are never collapsed. <c>br</c> starts a new line, tags in <c>NonVisibleTags</c> are
        /// skipped with their content, tags in <c>InlineTags</c> are traversed in place, and every other
        /// element is placed on lines of its own.
        /// </remarks>
        /// <param name="builder">Receives the text.</param>
        /// <param name="state">Whitespace/line state carried across nodes and recursion levels.</param>
        /// <param name="nodes">Nodes to process, in order.</param>
        private static void Plain(StringBuilder builder, ref ToPlainTextState state, IEnumerable <HtmlNode> nodes)
        {
            foreach (HtmlNode current in nodes)
            {
                var textNode = current as HtmlTextNode;

                if (textNode == null)
                {
                    string tagName = current.Name.ToLower();

                    if (tagName == "br")
                    {
                        builder.AppendLine();
                        state = ToPlainTextState.StartLine;
                        continue;
                    }

                    if (NonVisibleTags.Contains(tagName))
                    {
                        continue;
                    }

                    // Anything that is not inline gets a line break on either side,
                    // unless the output already sits at the start of a line.
                    bool isBlock = !InlineTags.Contains(tagName);

                    if (isBlock && state != ToPlainTextState.StartLine)
                    {
                        builder.AppendLine();
                        state = ToPlainTextState.StartLine;
                    }

                    Plain(builder, ref state, current.ChildNodes);

                    if (isBlock && state != ToPlainTextState.StartLine)
                    {
                        builder.AppendLine();
                        state = ToPlainTextState.StartLine;
                    }

                    continue;
                }

                string decoded = HtmlEntity.DeEntitize(textNode.Text);

                foreach (char symbol in decoded)
                {
                    bool isNonBreakingSpace = symbol == 0xA0 || symbol == 0x2007 || symbol == 0x202F;

                    if (char.IsWhiteSpace(symbol) && !isNonBreakingSpace)
                    {
                        // collapsible whitespace: just remember that a separator is pending
                        if (state == ToPlainTextState.NotWhiteSpace)
                        {
                            state = ToPlainTextState.WhiteSpace;
                        }

                        continue;
                    }

                    // flush the pending separator before the next emitted character
                    if (state == ToPlainTextState.WhiteSpace)
                    {
                        builder.Append(' ');
                    }

                    // non-breaking spaces are written as a plain space, everything else verbatim
                    builder.Append(isNonBreakingSpace ? ' ' : symbol);
                    state = ToPlainTextState.NotWhiteSpace;
                }
            }
        }
Exemple #30
0
        /// <summary>
        /// Writes a console-oriented text rendering of <paramref name="node"/> to <paramref name="outText"/>.
        /// </summary>
        /// <remarks>
        /// When <c>Filters</c> is set, a node is skipped if its trimmed id equals a filter with leading
        /// '#' characters removed, or its trimmed class attribute equals a filter with leading '.'
        /// characters removed. Images and links that resolve to an http/https URI are added to
        /// <c>Images</c> / <c>Links</c> and followed by a bracketed reference number.
        /// </remarks>
        /// <param name="node">Node to render.</param>
        /// <param name="outText">Writer that receives the output.</param>
        public void ConvertTo(HtmlNode node, TextWriter outText)
        {
            if (Filters != null)
            {
                string nodeId = node.Id.Trim();
                if (Filters.Any(filter => filter.TrimStart('#') == nodeId))
                {
                    return;
                }

                if (node.Attributes.Contains("class"))
                {
                    string cssClass = node.Attributes["class"].Value.Trim();
                    if (Filters.Any(filter => filter.TrimStart('.') == cssClass))
                    {
                        return;
                    }
                }
            }

            switch (node.NodeType)
            {
            case HtmlNodeType.Comment:
                // comments are never written
                return;

            case HtmlNodeType.Document:
                ConvertContentTo(node, outText);
                return;

            case HtmlNodeType.Text:
                // text inside script/style is not content
                string ownerTag = node.ParentNode.Name;
                if (ownerTag == "script" || ownerTag == "style")
                {
                    return;
                }

                string html = ((HtmlTextNode)node).Text;

                // a stray closing tag that the parser surfaced as text
                if (HtmlNode.IsOverlappedClosingElement(html))
                {
                    return;
                }

                // whitespace-only text carries nothing worth writing
                if (html.Trim().Length == 0)
                {
                    return;
                }

                foreach (var unwanted in trimChars)
                {
                    html = html.Replace(unwanted, ' ');
                }

                outText.Write(HtmlEntity.DeEntitize(html).Trim(trimChars));
                return;

            case HtmlNodeType.Element:
                // Cases that render their own children return directly;
                // cases that break fall through to the generic child rendering below.
                switch (node.Name.ToLower())
                {
                case "title":
                    if (node.HasChildNodes)
                    {
                        ConvertContentTo(node, outText);
                    }
                    return;

                case "meta":
                    // only the page description is surfaced
                    if (node.GetAttributeValue("name", "") == "description")
                    {
                        outText.Write(Environment.NewLine);
                        outText.Write(node.GetAttributeValue("content", ""));
                        outText.Write(Environment.NewLine);
                    }
                    break;

                case "h1":
                case "h2":
                case "h3":
                    // headers: blank line before, line break after
                    outText.Write(Environment.NewLine);
                    outText.Write(Environment.NewLine);
                    if (node.HasChildNodes)
                    {
                        ConvertContentTo(node, outText);
                    }
                    outText.Write(Environment.NewLine);
                    return;

                case "p":
                case "ul":
                case "ol":
                case "div":
                case "br":
                    // block-ish elements start on a new line
                    outText.Write(Environment.NewLine);
                    break;

                case "li":
                    outText.Write(Environment.NewLine + "* ");
                    break;

                case "img":
                    outText.Write(Environment.NewLine + imageLinkTextHighlight + "[img:" + node.Attributes["alt"]?.Value + "]" + resetColor);
                    if (node.Attributes.Contains("src"))
                    {
                        Uri imageUri;
                        if (TryResolveHttpUri(node.Attributes["src"].Value, out imageUri))
                        {
                            Images.Add(imageUri);
                            outText.Write(imageLinkHighlight + "[" + Images.Count + "] " + resetColor);
                        }
                    }
                    break;

                case "strong":
                    outText.Write(" " + linkTextHighlight);
                    if (node.HasChildNodes)
                    {
                        ConvertContentTo(node, outText);
                    }
                    outText.Write(resetColor + " ");
                    return;

                case "a":
                    outText.Write(linkTextHighlight + " [Link:");
                    if (node.HasChildNodes)
                    {
                        ConvertContentTo(node, outText);
                    }
                    outText.Write("]" + resetColor);
                    if (node.Attributes.Contains("href"))
                    {
                        Uri linkUri;
                        if (TryResolveHttpUri(node.Attributes["href"].Value, out linkUri))
                        {
                            Links.Add(linkUri);
                            outText.Write(linkHighlight + "[" + (Links.Count + LinkStartFrom) + "] " + resetColor);
                        }
                    }
                    return;

                case "i":
                    outText.Write(" ");
                    if (node.HasChildNodes)
                    {
                        ConvertContentTo(node, outText);
                    }
                    outText.Write(" ");
                    return;
                }

                if (node.HasChildNodes)
                {
                    ConvertContentTo(node, outText);
                }
                return;
            }
        }

        /// <summary>
        /// Tries to turn <paramref name="reference"/> into an http/https URI, first as an absolute URI
        /// and, failing that, relative to <c>BaseUri</c>.
        /// </summary>
        /// <param name="reference">Attribute value to resolve.</param>
        /// <param name="resolved">The resolved URI; meaningful only when the method returns true.</param>
        /// <returns>True when one of the two attempts produced an http or https URI.</returns>
        private bool TryResolveHttpUri(string reference, out Uri resolved)
        {
            if (Uri.TryCreate(reference, UriKind.Absolute, out resolved) &&
                (resolved.Scheme == Uri.UriSchemeHttp || resolved.Scheme == Uri.UriSchemeHttps))
            {
                return true;
            }

            return Uri.TryCreate(BaseUri, reference, out resolved) &&
                   (resolved.Scheme == Uri.UriSchemeHttp || resolved.Scheme == Uri.UriSchemeHttps);
        }