/// <summary>
/// Parses the HTML of a section-details listing into sections keyed by CRN.
/// </summary>
/// <remarks>
/// Every table row is a section and/or a meeting: a row with a CRN in its second
/// column starts a new section, and every row (with or without a CRN) contributes
/// one meeting to the most recently started section.
/// </remarks>
/// <param name="content">Raw HTML returned by the section-details request.</param>
/// <returns>
/// The parsed sections keyed by CRN; empty when the page reports that no classes
/// matched the search.
/// </returns>
/// <exception cref="ApplicationException">
/// The expected table, an expected column, or a meeting location is missing, or a
/// meeting row appears before any section row.
/// </exception>
public Dictionary<string, MyPurdueSection> ParseHtml(string content)
{
    // Prepare section list
    var sections = new Dictionary<string, MyPurdueSection>();
    MyPurdueSection section = null;

    // Check if we didn't return any classes
    if (content.Contains("No classes were found that meet your search criteria"))
    {
        return sections;
    }

    var document = new HtmlDocument();
    document.LoadHtml(content);
    HtmlNodeCollection sectionNodes = document.DocumentNode.SelectNodes(
        "/html/body/div[@class='pagebodydiv'][1]/table[@class='datadisplaytable'][1]/tr[ not ( th ) ]");
    if (sectionNodes == null)
    {
        throw new ApplicationException("Could not parse data from section details request.");
    }

    // The page is machine-generated, so numbers are parsed culture-independently;
    // otherwise "3.000" credit hours would be read as 3000 under comma-decimal cultures.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    // Resolved lazily on the first meeting row and then reused for every row.
    // TODO: Not hard-code time zone
    TimeZoneInfo timeZone = null;

    // Loop through table rows
    foreach (HtmlNode node in sectionNodes)
    {
        var crnNode = node.SelectSingleNode("td[2]");
        if (crnNode == null)
        {
            continue; // No node? Skip...
        }

        // If there's a CRN in this row, it means that we're looking at a new section.
        var crnNumber = HtmlEntity.DeEntitize(crnNode.InnerText).Trim();
        if (crnNumber.Length > 0)
        {
            // Section w/ primary meeting data
            section = new MyPurdueSection()
            {
                Crn = crnNumber,
                SubjectCode = GetCellText(node, 3),
                Number = GetCellText(node, 4),
                SectionCode = GetCellText(node, 5),
                CampusCode = GetCellText(node, 6),
                Title = GetCellText(node, 8),
                Capacity = Int32.Parse(GetCellText(node, 11), invariant),
                Enrolled = Int32.Parse(GetCellText(node, 12), invariant),
                RemainingSpace = Int32.Parse(GetCellText(node, 13), invariant),
                WaitlistCapacity = Int32.Parse(GetCellText(node, 14), invariant),
                WaitlistCount = Int32.Parse(GetCellText(node, 15), invariant),
                WaitlistSpace = Int32.Parse(GetCellText(node, 16), invariant),
                Type = GetCellText(node, 23),
                Description = GetCellText(node, 26),
                Meetings = new List<MyPurdueMeeting>()
            };

            // Credit hours may be a range ("a-b" or "a/b"); keep the part after the
            // first separator, preferring '-' over '/'.
            var credits = GetCellText(node, 7);
            int separator = credits.IndexOf('-');
            if (separator < 0)
            {
                separator = credits.IndexOf('/');
            }
            if (separator >= 0)
            {
                credits = credits.Substring(separator + 1);
            }
            section.CreditHours = double.Parse(credits, invariant);

            sections.Add(crnNumber, section);
        }

        // A meeting-only row needs a section to attach to; fail with a clear
        // message instead of a NullReferenceException further down.
        if (section == null)
        {
            throw new ApplicationException("Encountered meeting data before any section (CRN) row.");
        }

        // Now, update meeting data for this row
        var meeting = new MyPurdueMeeting();

        // Parse days of week
        meeting.DaysOfWeek = ParseUtility.ParseDaysOfWeek(GetCellText(node, 9));

        // Parse times
        if (timeZone == null)
        {
            timeZone = TimeZoneInfo.FindSystemTimeZoneById("Eastern Standard Time");
        }
        var startEndTimes = ParseUtility.ParseStartEndTime(GetCellText(node, 10), timeZone);
        meeting.StartTime = startEndTimes.Item1;
        meeting.EndTime = startEndTimes.Item2;

        // Dates (td[21]) are intentionally not parsed: no year is present, so they are not reliable.

        // Update meeting location (building short name)
        var loc = GetCellText(node, 22);
        if (loc.Equals("TBA"))
        {
            meeting.BuildingCode = "TBA";
            meeting.BuildingName = "TBA";
            meeting.RoomNumber = "TBA";
        }
        else if (loc.Length > 0)
        {
            // "<building> <room>": everything before the first space is the building code.
            int space = loc.IndexOf(' ');
            if (space >= 0)
            {
                meeting.BuildingCode = loc.Substring(0, space).Trim();
                meeting.RoomNumber = loc.Substring(space + 1).Trim();
            }
            else
            {
                meeting.BuildingCode = loc;
                meeting.RoomNumber = "";
            }
        }
        else
        {
            throw new ApplicationException("Could not parse location data for section CRN " + section.Crn + ".");
        }

        // Updating meeting type
        meeting.Type = GetCellText(node, 23);

        // Add the meeting
        section.Meetings.Add(meeting);
    }

    return sections;
}

/// <summary>
/// Returns the trimmed, entity-decoded text of the 1-based <paramref name="column"/> cell of a table row.
/// </summary>
/// <exception cref="ApplicationException">The row has no such cell.</exception>
private static string GetCellText(HtmlNode row, int column)
{
    string index = column.ToString(System.Globalization.CultureInfo.InvariantCulture);
    HtmlNode cell = row.SelectSingleNode("td[" + index + "]");
    if (cell == null)
    {
        throw new ApplicationException("Could not find column " + index + " in section details row.");
    }

    return HtmlEntity.DeEntitize(cell.InnerText).Trim();
}
/// <summary>
/// Writes a text rendering of <paramref name="node"/> (and, recursively, its children)
/// to <paramref name="outText"/>.
/// </summary>
/// <remarks>
/// Comments and text inside script/style elements are dropped. Titles are wrapped in
/// a u tag, paragraphs emit a CRLF, and anchors that carry an href are wrapped in an
/// exec tag whose command opens the link; targets that do not start with "http" are
/// prefixed with <paramref name="startUrl"/>.
/// </remarks>
/// <param name="startUrl">Prefix applied to link targets that do not start with "http".</param>
/// <param name="node">Node to convert.</param>
/// <param name="outText">Destination writer.</param>
private static void ConvertTo(string startUrl, HtmlNode node, TextWriter outText)
{
    switch (node.NodeType)
    {
        case HtmlNodeType.Comment:
            // don't output comments
            break;

        case HtmlNodeType.Document:
            ConvertContentTo(startUrl, node, outText);
            break;

        case HtmlNodeType.Text:
            // script and style must not be output
            string parentName = node.ParentNode.Name;
            if ((parentName == "script") || (parentName == "style"))
            {
                break;
            }

            // get text
            string html = ((HtmlTextNode)node).Text;

            // is it in fact a special closing node output as text?
            if (HtmlNode.IsOverlappedClosingElement(html))
            {
                break;
            }

            // check the text is meaningful and not a bunch of whitespaces
            if (html.Trim().Length > 0)
            {
                outText.WriteLine(HtmlEntity.DeEntitize(html));
            }
            break;

        case HtmlNodeType.Element:
            switch (node.Name)
            {
                case "title":
                    outText.WriteLine($"<u>{node.InnerText}</u>");
                    break;

                case "p":
                    // treat paragraphs as crlf
                    outText.Write("\r\n");
                    break;

                case "a":
                    // Test for a missing/empty href BEFORE prefixing: once startUrl has
                    // been prepended the target is never empty, so anchors without an
                    // href used to become links to startUrl itself.
                    string href = node.GetAttributeValue("href", "");
                    if (href != "")
                    {
                        string linkTarget = href.StartsWith("http", System.StringComparison.Ordinal)
                            ? href
                            : startUrl + href;
                        outText.Write($"<exec cmd=\"!wb {linkTarget}\">{node.InnerText}</exec>");
                    }
                    break;
            }

            if (node.HasChildNodes)
            {
                ConvertContentTo(startUrl, node, outText);
            }
            break;
    }
}
/// <summary>
/// Downloads the page at <paramref name="relativeUrl"/> (relative to
/// <c>WikitionaryParser.WikitionaryRootUrl</c>) and extracts the idiom's name,
/// usages (definitions, examples, quotes), synonyms and categories.
/// </summary>
/// <remarks>
/// Everything that follows an <c>hr</c> element as a sibling is removed before parsing.
/// Headlines listed in <c>H3HeadLinesToIgnore</c>, and headlines that are not followed
/// by an ordered list, do not produce a usage.
/// </remarks>
/// <param name="relativeUrl">Page URL relative to the site root.</param>
/// <returns>The parsed idiom.</returns>
public Idiom ParseIdiomPage(string relativeUrl)
{
    var absoluteUrl = WikitionaryParser.WikitionaryRootUrl + relativeUrl;
    var document = _web.Load(absoluteUrl).DocumentNode;

    // delete all nodes for other sections than English
    var nodesToRemove = document.SelectNodes("//hr/following-sibling::*");
    if (nodesToRemove != null)
    {
        foreach (var nodeToRemove in nodesToRemove)
        {
            nodeToRemove.Remove();
        }
    }

    // name
    var name = HtmlEntity.DeEntitize(document.SelectSingleNode("//h1[@id='firstHeading']").InnerText.Trim());

    // usages
    var usages = new List<Usage>();
    var relevantUsageSections = document.SelectNodes("//h3/span[@class='mw-headline']");
    if (relevantUsageSections != null)
    {
        foreach (var relevantUsageSection in relevantUsageSections.Where(s => !H3HeadLinesToIgnore.Contains(s.InnerText.Trim())))
        {
            // A headline without a following ordered list carries no definitions.
            var olNode = relevantUsageSection.SelectSingleNode("./../following-sibling::ol");
            if (olNode == null)
            {
                continue;
            }

            var definitionsAndExamples = new List<DefinitionAndExamples>();
            var defNodes = olNode.SelectNodes("./li");
            if (defNodes != null)
            {
                foreach (var defNode in defNodes)
                {
                    // The definition text is the list item minus its nested
                    // example (dl) and quote (ul) blocks; strip them from a copy.
                    var clone = defNode.CloneNode(true);
                    var childrenToRemove = clone.SelectNodes("./dl|./ul");
                    if (childrenToRemove != null)
                    {
                        foreach (var childToRemove in childrenToRemove)
                        {
                            clone.RemoveChild(childToRemove);
                        }
                    }

                    definitionsAndExamples.Add(new DefinitionAndExamples()
                    {
                        Definition = clone.InnerText.Trim(),
                        Examples = SelectDecodedTexts(defNode, "./dl/dd"),
                        Quotes = SelectDecodedTexts(defNode, "./ul/li//dd")
                    });
                }
            }

            usages.Add(new Usage()
            {
                DefinitionsAndExamples = definitionsAndExamples,
                PartOfSpeech = HtmlEntity.DeEntitize(relevantUsageSection.InnerText.Trim())
            });
        }
    }

    // synonyms
    var synonyms = SelectDecodedTexts(document, "//span[@id='Synonyms']/../following-sibling::ul//a");

    // Categories (left entity-encoded, as before); a page without a category box yields none.
    var categoryNodes = document.SelectNodes("//div[@id='mw-normal-catlinks']/ul/li/a");
    var categories = categoryNodes == null
        ? new List<string>()
        : categoryNodes.Select(n => n.InnerText.Trim()).ToList();

    return new Idiom()
    {
        Name = name,
        Synonyms = synonyms,
        SourceRelativeUrl = relativeUrl,
        Categories = categories,
        Usages = usages
    };
}

/// <summary>
/// Returns the trimmed, entity-decoded inner text of every node matched by
/// <paramref name="xpath"/> below <paramref name="root"/>, or an empty list when nothing matches.
/// </summary>
private static List<string> SelectDecodedTexts(HtmlNode root, string xpath)
{
    var nodes = root.SelectNodes(xpath);
    if (nodes == null)
    {
        return new List<string>();
    }

    return nodes.Select(n => HtmlEntity.DeEntitize(n.InnerText.Trim())).ToList();
}
/// <summary>
/// Writes a lower-cased plain-text rendering of <paramref name="node"/> and its
/// descendants to <paramref name="outText"/>.
/// </summary>
/// <remarks>
/// Comments and the contents of script/style elements are skipped. Each non-blank
/// text node is trimmed, lower-cased, entity-decoded and written followed by a single
/// space; text whose decoding throws is written as "?". A p element emits CRLF
/// before its children are processed.
/// </remarks>
/// <param name="node">Node to convert.</param>
/// <param name="outText">Destination writer.</param>
private static void ConvertTo(HtmlNode node, TextWriter outText)
{
    HtmlNodeType kind = node.NodeType;

    if (kind == HtmlNodeType.Document)
    {
        ConvertContentTo(node, outText);
        return;
    }

    if (kind == HtmlNodeType.Text)
    {
        // Script and style bodies are never rendered.
        string container = node.ParentNode.Name;
        if (container == "script" || container == "style")
        {
            return;
        }

        string raw = ((HtmlTextNode)node).Text;

        // Closing tags the parser surfaced as text are not content.
        if (HtmlNode.IsOverlappedClosingElement(raw))
        {
            return;
        }

        // Trim() already strips '\n' and '\r' together with all other edge whitespace.
        string meaningful = raw.Trim();
        if (meaningful.Length == 0)
        {
            return;
        }

        string decoded;
        try
        {
            // NOTE(review): lower-casing happens before decoding, so entity names are
            // lower-cased too — confirm this order is intended.
            decoded = HtmlEntity.DeEntitize(meaningful.ToLowerInvariant());
        }
        catch (Exception)
        {
            decoded = "?";
        }

        outText.Write(decoded);
        outText.Write(" ");
        return;
    }

    if (kind != HtmlNodeType.Element)
    {
        // Comments (and any other node kind) produce no output.
        return;
    }

    // Paragraphs start on a fresh line.
    if (node.Name == "p")
    {
        outText.Write("\r\n");
    }

    if (node.HasChildNodes)
    {
        ConvertContentTo(node, outText);
    }
}
/// <summary>
/// Scrapes the brewery index page and each linked brewery page, logging progress to
/// the console, and returns every beer found.
/// </summary>
/// <remarks>
/// A beer entry is either "<strong>name</strong> description" or "name, description".
/// Paragraphs without any child node are skipped. One second is awaited between
/// brewery pages.
/// </remarks>
/// <returns>All beers found across the brewery pages.</returns>
public async Task<IList<Beer>> Fetch()
{
    var beers = new List<Beer>();

    Console.WriteLine("Scraping Cloudwater FFB...");
    Console.WriteLine();

    var breweriesPage = await ScrapeHelper.FetchParseAsync("");

    // Materialize once: the selector result is both counted and iterated.
    var breweryNodes = breweriesPage.QuerySelectorAll("h2 > a").ToList();
    var breweryCount = breweryNodes.Count;

    Console.WriteLine($"Found {breweryCount} breweries...");
    Console.WriteLine();

    foreach (var breweryNode in breweryNodes)
    {
        var href = breweryNode.Attributes["href"]?.Value;
        if (string.IsNullOrWhiteSpace(href))
        {
            continue;
        }

        var breweryName = CultureInfo.CurrentCulture.TextInfo.ToTitleCase(
            breweryNode.InnerText.Trim()?.ToLower()
            );
        var decodedBreweryName = breweryName != null ? HtmlEntity.DeEntitize(breweryName) : null;

        Console.WriteLine(breweryName);
        Console.WriteLine("------------------------------");

        var beersPage = await ScrapeHelper.FetchParseAsync(href);
        var beerNodes = beersPage.QuerySelectorAll("ul > li > p").ToList();

        if (beerNodes.Count == 0)
        {
            Console.WriteLine("No beers found (yet)");
        }
        else
        {
            foreach (var beerNode in beerNodes)
            {
                var children = beerNode.ChildNodes;
                if (children.Count == 0)
                {
                    // Empty paragraph: nothing to parse (indexing it would throw).
                    continue;
                }

                var first = children[0];
                string beerName;
                string description;

                // Style: <strong>beer name</strong> description <strong>(v)</strong>
                if (string.Equals(first.Name, "strong", StringComparison.OrdinalIgnoreCase))
                {
                    beerName = first.InnerText?.Trim();

                    // The description node is optional; a lone <strong> has none.
                    description = children.Count > 1 ? children[1].InnerText?.Trim() : null;
                }
                // Style: beer name, description <strong>(v)</strong>
                else
                {
                    var parts = (first.InnerText ?? string.Empty).Trim().Split(',');
                    beerName = parts[0].Trim();
                    description = string.Join(',', parts.Skip(1));
                }

                description = description?.TrimStart(' ', ',');

                Console.WriteLine($"{beerName} ----- {description}");

                beers.Add(new Beer
                {
                    BreweryName = decodedBreweryName,
                    BeerName = beerName != null ? HtmlEntity.DeEntitize(beerName) : null,
                    Description = description != null ? HtmlEntity.DeEntitize(description) : null
                });
            }
        }

        Console.WriteLine();

        await Task.Delay(1000);
    }

    Console.WriteLine($"Found {beers.Count} beers.");

    return beers;
}
/// <summary>
/// Walks the paginated animal listing, opens every animal's detail page and collects
/// its data (info fields, description and up to five photos) into a list.
/// </summary>
/// <remarks>
/// NOTE(review): the parameter's type is not present in the visible source; the
/// signature is reproduced verbatim. Only <c>shelter_id.ID</c> is read from it.
/// </remarks>
/// <returns>One entry per animal link found on the listing pages.</returns>
public List <ListTemp> scrap( shelter_id)
{
    const int MaxPhotosPerAnimal = 5;

    HtmlWeb web_page = new HtmlWeb();
    List<ListTemp> listTemp = new List<ListTemp>();

    string url = @"";
    var doc = web_page.Load(url);
    int number = 1;
    bool allList = false;

    while (!allList)
    {
        // Inspect the pagination of the page being processed before moving on.
        var nextPage = doc.DocumentNode.SelectNodes("//div[@class = 'pagination']/a[@class = 'next']");
        number++;
        url = "" + number;

        // A page without any animal link simply contributes nothing.
        var linkNodes = doc.DocumentNode.SelectNodes("//a[@class = 'animals_btn_list_more']/@href");
        var animal_link = linkNodes == null
            ? new List<string>()
            : linkNodes.Select(q => q.GetAttributeValue("href", null)).ToList();

        for (int i = 0; i < animal_link.Count; i++)
        {
            // Per-animal state starts fresh so that a field missing on one page
            // cannot inherit the previous animal's value.
            string _name = "";
            string _breed = "";
            string _gender = "";
            string _age = "";
            string _weight = "";
            DateTime _dateStart = DateTime.Today;
            List<byte[]> _photo = new List<byte[]>();

            var animal_doc = web_page.Load(@"" + animal_link[i]);

            //--INFO
            var infoNodes = animal_doc.DocumentNode.SelectNodes("//div[@class = 'info']");
            if (infoNodes != null && infoNodes.Count > 0)
            {
                var nodeInfo = infoNodes[0].InnerText.Replace("\r", "").Replace("\n", "").Trim();
                nodeInfo = HtmlEntity.DeEntitize(nodeInfo).Trim();

                // The text is split on ':', so each piece is whatever precedes the
                // next colon with that colon's label at its end. The label found in a
                // piece therefore marks the END of the PREVIOUS field's value, which
                // is why each label below fills the field before it.
                // NOTE(review): this relies on the page's field order — confirm against the site.
                foreach (var piece in nodeInfo.Split(':'))
                {
                    if (piece.Contains("Gatunek"))
                    {
                        _name = piece.Replace("Gatunek", "").Trim();
                    }
                    if (piece.Contains("Płeć"))
                    {
                        _breed = piece.Replace("Płeć", "").Trim();
                    }
                    if (piece.Contains("Wiek"))
                    {
                        _gender = piece.Replace("Wiek", "").Trim();
                    }
                    if (piece.Contains("Waga"))
                    {
                        _age = piece.Replace("Waga", "").Replace("lat", "").Replace("rok", "").Trim();
                    }
                    if (piece.Contains("Data przyjęcia"))
                    {
                        _weight = piece.Replace("Data przyjęcia", "").Trim();
                    }
                    if (piece.Contains("ewidencyjny"))
                    {
                        // Remaining text is a day.month.year date.
                        var dateParts = piece.Replace("Nr ewidencyjny", "").Trim().Split('.');
                        var year = int.Parse(dateParts[2]);
                        var month = int.Parse(dateParts[1]);
                        var day = int.Parse(dateParts[0]);
                        _dateStart = new DateTime(year, month, day);
                    }
                }
            }
            //--INFO

            //--Description
            var descriptionNodes = animal_doc.DocumentNode.SelectNodes("//div[@class = 'description']");
            string _description = descriptionNodes == null
                ? ""
                : string.Concat(descriptionNodes.Select(q => q.InnerText));
            _description = HtmlEntity.DeEntitize(_description).Replace("\r", " ").Replace("\n", "").Trim();
            //--Description

            //--Photo
            var node_Photo = animal_doc.DocumentNode.SelectNodes("//div[@class = 'ani_images']/div[@class = 'ani_image_bottom']/a");
            if (node_Photo != null)
            {
                var photoLink = @"";
                var photoHrefs = node_Photo
                    .Select(q => q.GetAttributeValue("href", null))
                    .Take(MaxPhotosPerAnimal);

                // One client serves all (sequential) downloads for this animal.
                using (var client = new WebClient())
                {
                    foreach (var photoHref in photoHrefs)
                    {
                        _photo.Add(client.DownloadData(photoLink + photoHref));
                    }
                }
            }
            //--Photo

            listTemp.Add(new ListTemp()
            {
                name = _name,
                breed = _breed,
                gender = _gender,
                age = _age,
                weight = _weight,
                description = _description,
                dateStart = _dateStart,
                shelter_ID = shelter_id.ID,
                photo = _photo,
            });
        }

        if (nextPage == null)
        {
            // Last page: stop without requesting a page that would be discarded.
            allList = true;
        }
        else
        {
            doc = web_page.Load(url);
        }
    }

    return listTemp;
}
/// <summary>
/// Finds the first anchor on <paramref name="currentPage"/> whose trimmed,
/// entity-decoded text equals <paramref name="messageText"/> (invariant culture,
/// case-insensitive) and returns its href.
/// </summary>
/// <param name="currentPage">Document to search.</param>
/// <param name="messageText">Link text to look for.</param>
/// <returns>The anchor's href; an empty string when the anchor has no href.</returns>
/// <exception cref="InvalidOperationException">
/// No anchor with the requested text exists, including when the page has no anchors at all.
/// </exception>
public static string GetNextUri(HtmlDocument currentPage, string messageText)
{
    // SelectNodes returns null (not an empty collection) when nothing matches;
    // report that the same way as "no anchor with this text".
    HtmlNodeCollection anchors = currentPage.DocumentNode.SelectNodes("//a");
    if (anchors == null)
    {
        throw new InvalidOperationException("The page contains no links.");
    }

    HtmlNode a = anchors.First(x => string.Equals(
        messageText,
        HtmlEntity.DeEntitize(x.InnerText).Trim(),
        StringComparison.InvariantCultureIgnoreCase));

    string uri = a.GetAttributeValue("href", def: null);

    // Concatenation with the (empty) prefix also turns a missing href into "".
    uri = "" + uri;
    return uri;
}
/// <summary>
/// Converts the given <paramref name="htmlContent" /> into a collection of <see cref="IHtmlElement" />s.
/// </summary>
/// <remarks>
/// An <see cref="HtmlTag" /> is returned as-is, a <see cref="StringHtmlContent" /> is wrapped in an
/// <see cref="HtmlText" />, and a <see cref="TagBuilder" /> is rebuilt as an <see cref="HtmlTag" />
/// (attribute values entity-decoded, inner HTML parsed recursively). Any other content is rendered
/// to a string and parsed from that string.
/// </remarks>
/// <param name="htmlContent">The html content</param>
/// <param name="validateSyntax">A value indicating whether the html should be checked for syntax errors.</param>
/// <returns>The resulting elements</returns>
/// <exception cref="ArgumentNullException">If <paramref name="htmlContent" /> is null</exception>
/// <exception cref="InvalidOperationException">
///     If <paramref name="validateSyntax" /> is true and syntax errors are
///     encountered in the <paramref name="htmlContent" />
/// </exception>
public static IEnumerable<IHtmlElement> ParseAll(IHtmlContent htmlContent, bool validateSyntax = false)
{
    if (htmlContent == null)
    {
        throw new ArgumentNullException(nameof(htmlContent));
    }

    switch (htmlContent)
    {
        // Already an HtmlTag: nothing to parse.
        case HtmlTag alreadyHtmlTag:
            return new[] { alreadyHtmlTag };

        // A string that may contain HTML but must be encoded when writing.
        case StringHtmlContent s:
            return new[] { new HtmlText(s) };

        // A TagBuilder is translated piece by piece.
        case TagBuilder tagBuilder:
        {
            var htmlTag = new HtmlTag(tagBuilder.TagName)
                .WithTagRenderMode(tagBuilder.TagRenderMode);

            foreach (var attribute in tagBuilder.Attributes)
            {
                htmlTag = htmlTag.Attribute(attribute.Key, HtmlEntity.DeEntitize(attribute.Value));
            }

            if (tagBuilder.HasInnerHtml)
            {
                var contents = ParseAll(tagBuilder.InnerHtml, validateSyntax).ToImmutableList();
                htmlTag = htmlTag.WithContents(contents);
            }

            return new[] { htmlTag };
        }

        // Anything else: render to a string and parse that.
        default:
            return ParseAll(htmlContent.ToHtmlString(), validateSyntax);
    }
}
/**************************************************************************/

/// <summary>
/// Resolves <paramref name="Url"/> against <paramref name="BaseUrl"/> and returns the
/// result after passing it through <c>SanitizeUrl</c>.
/// </summary>
/// <remarks>
/// Both inputs are HTML-entity-decoded and percent-unescaped first. Supported
/// reference forms: absolute http/https, protocol-relative ("//host/..."),
/// root-relative ("/path"), query-only ("?q"), fragment-only ("#frag") and
/// path-relative. A fragment-only reference resolves to the base document itself
/// (path and query, fragment dropped). If the resolved string is not a valid
/// absolute URI, the (decoded) <paramref name="Url"/> is sanitized and returned instead.
/// </remarks>
/// <param name="BaseUrl">Absolute URL of the document containing the link.</param>
/// <param name="Url">The link to resolve.</param>
/// <returns>
/// The sanitized absolute URL, or null when <paramref name="Url"/> uses a scheme
/// other than http/https (e.g. "mailto:").
/// </returns>
/// <exception cref="MacroscopeUriFormatException"><paramref name="BaseUrl"/> is not a valid absolute URI.</exception>
public static string MakeUrlAbsolute(
    string BaseUrl,
    string Url
    )
{
    BaseUrl = HtmlEntity.DeEntitize(BaseUrl);
    BaseUrl = Uri.UnescapeDataString(BaseUrl);

    Url = HtmlEntity.DeEntitize(Url);
    Url = Uri.UnescapeDataString(Url);

    Uri BaseUri = null;
    string BaseUriPort = "";

    try
    {
        BaseUri = new Uri(BaseUrl, UriKind.Absolute);

        if (BaseUri.Port > 0)
        {
            BaseUriPort = string.Format(":{0}", BaseUri.Port);
        }
    }
    catch (Exception ex)
    {
        DebugMsgStatic(string.Format("MakeUrlAbsolute: {0}", ex.Message));
    }

    if (BaseUri == null)
    {
        throw new MacroscopeUriFormatException("Malformed Base URI");
    }

    // Classify the reference by its prefix (ordinal, case-sensitive, anchored at the start).
    bool IsHttp = Url.StartsWith("http:", StringComparison.Ordinal)
                  || Url.StartsWith("https:", StringComparison.Ordinal);
    bool IsProtocolRelative = Url.StartsWith("//", StringComparison.Ordinal);
    bool IsRootRelative = Url.StartsWith("/", StringComparison.Ordinal); // also true for "//"
    bool IsQueryOnly = Url.StartsWith("?", StringComparison.Ordinal);
    bool IsFragmentOnly = Url.StartsWith("#", StringComparison.Ordinal);
    bool HasScheme = Url.IndexOf(':') > 0; // at least one character before the first ':'

    if (!(IsHttp || IsRootRelative || IsFragmentOnly))
    {
        DebugMsgStatic(string.Format("STRANGE URL: 1: {0}", BaseUrl));
        DebugMsgStatic(string.Format("STRANGE URL: 2: {0}", Url));
    }

    // Anything with a scheme that is not http/https (mailto:, javascript:, ...) is
    // rejected here, so no later branch needs to handle it.
    if (!IsHttp && !IsProtocolRelative && !IsRootRelative && !IsQueryOnly && !IsFragmentOnly && HasScheme)
    {
        DebugMsgStatic(string.Format("STRANGE URL: IS SUSPECT: {0}", Url));
        return null;
    }

    string Authority = string.Format("{0}://{1}{2}", BaseUri.Scheme, BaseUri.Host, BaseUriPort);
    string Candidate;

    if (IsProtocolRelative)
    {
        Candidate = string.Format("{0}:{1}", BaseUri.Scheme, Url);
    }
    else if (IsRootRelative)
    {
        Candidate = Authority + Url;
    }
    else if (IsQueryOnly)
    {
        Candidate = Authority + BaseUri.AbsolutePath + Url;
    }
    else if (IsFragmentOnly)
    {
        // A fragment-only reference points at the current document: keep the base
        // path and query and drop the fragment (previously this collapsed to the site root).
        Candidate = Authority + BaseUri.PathAndQuery;
    }
    else if (IsHttp)
    {
        Candidate = Url;
    }
    else
    {
        // Path-relative: replace the last segment of the base path.
        DebugMsgStatic(string.Format("RELATIVE URL 1: {0}", Url));

        string BasePath = Regex.Replace(BaseUri.AbsolutePath, "/[^/]+$", "/");
        string NewPath = string.Join("", BasePath, Url);

        DebugMsgStatic(string.Format("RELATIVE URL 2: {0}", BasePath));
        DebugMsgStatic(string.Format("RELATIVE URL 3: {0}", NewPath));

        Candidate = Authority + NewPath;
    }

    Uri NewUri = TryCreateAbsoluteUri(Candidate);

    string UrlFixed = NewUri != null ? NewUri.ToString() : Url;
    UrlFixed = SanitizeUrl(UrlFixed);

    return UrlFixed;
}

/// <summary>
/// Builds an absolute <see cref="Uri"/> from <paramref name="Candidate"/>; logs the
/// failure and returns null when the string is not a valid absolute URI.
/// </summary>
private static Uri TryCreateAbsoluteUri(string Candidate)
{
    try
    {
        return new Uri(Candidate, UriKind.Absolute);
    }
    catch (InvalidOperationException ex)
    {
        DebugMsgStatic(ex.Message);
    }
    catch (UriFormatException ex)
    {
        DebugMsgStatic(ex.Message);
    }

    return null;
}
/// <summary>
/// Runs a trip search against the configured host: loads the start page to obtain the
/// query id from the search form's action, posts the search, and parses the overview
/// rows of the result table into trips.
/// </summary>
/// <param name="request">The search to perform.</param>
/// <returns>The trips listed in the result table (empty when it contains no overview rows).</returns>
/// <exception cref="InvalidOperationException">
/// The start page has no usable search form, or the response has no result table.
/// </exception>
/// <exception cref="HttpRequestException">A request returned a non-success status code.</exception>
public async Task<IEnumerable<PublicTransportTrip>> SearchAsync(TripSearchRequest request)
{
    // Dispose the client when done so its connections are released.
    // NOTE(review): a shared, long-lived HttpClient would be preferable if this is called frequently.
    using (var client = new HttpClient())
    {
        var startResponse = (await client.GetAsync(options.Host))
            .EnsureSuccessStatusCode();
        var startHtml = new HtmlDocument();
        startHtml.LoadHtml(await startResponse.Content.ReadAsStringAsync());

        var formNode = startHtml.DocumentNode.SelectSingleNode("//form[@id='HFSQuery']");
        var formAction = formNode?.Attributes["action"]?.Value;
        if (formAction == null)
        {
            throw new InvalidOperationException("Could not find the search form on the start page.");
        }
        int queryId = int.Parse(queryIdRegex.Match(formAction).Groups["id"].Value);

        var requestContent = options.BuildSearch(request);
        var response = (await client.PostAsync(
            options.SearchURI(queryId),
            new FormUrlEncodedContent(requestContent)))
            .EnsureSuccessStatusCode();

        var resultHtml = new HtmlDocument();
        resultHtml.LoadHtml(await response.Content.ReadAsStringAsync());

        var resultTable = resultHtml.DocumentNode.SelectSingleNode("//table[@class='resultTable']");
        if (resultTable == null)
        {
            throw new InvalidOperationException("No trips could be found!");
        }

        var result = new List<PublicTransportTrip>();

        // NOTE(review): an XPath starting with "//" searches from the document root,
        // not just inside resultTable — confirm whether ".//tr[...]" was intended.
        var rows = resultTable.SelectNodes("//tr[starts-with(@id, 'trOverview') and not(starts-with(@id, 'trOverviewHint') )]");
        if (rows == null)
        {
            return result;
        }

        foreach (var row in rows)
        {
            var columns = row.SelectNodes("td");
            if (columns == null)
            {
                // A row without cells carries no trip data.
                continue;
            }

            var builder = PublicTransportTrip.NewBuilder(Operator);
            DateTime startDate = DateTime.MinValue;
            TimeSpan startTime = TimeSpan.MinValue;
            DateTime endDate = DateTime.MinValue;
            TimeSpan endTime = TimeSpan.MinValue;

            foreach (var column in columns)
            {
                switch (column.Attributes["headers"]?.Value)
                {
                    case "hafasOVStop":
                        var startStop = GetTextContent(column.FirstChild);
                        var endStop = GetTextContent(column.LastChild);
                        builder.SetStartLocation(new Destination(startStop, 0, 0));
                        builder.SetEndLocation(new Destination(endStop, 0, 0));
                        break;

                    case "hafasOVDate":
                        startDate = DateTime.ParseExact(column.InnerText, "dd.MM.yyyy", CultureInfo.InvariantCulture);
                        endDate = startDate; // TODO Could go over 2 days
                        break;

                    case "hafasOVTime":
                        // Two lines: departure prefixed with "ab", arrival prefixed with "an".
                        var planned = column.SelectSingleNode("div/div[@class='planed']").InnerText;
                        string[] lines = planned.Trim().Split('\n');
                        startTime = TimeSpan.Parse(lines[0].Replace("ab", ""), CultureInfo.InvariantCulture);
                        endTime = TimeSpan.Parse(lines[1].Replace("an", ""), CultureInfo.InvariantCulture);
                        break;

                    case "hafasOVDuration":
                        // The duration is not used; it is deliberately not parsed so an
                        // unusual value cannot abort the whole search.
                        break;

                    case "hafasOVChanges":
                        break;

                    case "hafasOVProducts":
                        builder.AddType(PublicTransportType.Train); // TODO
                        break;

                    default:
                        break;
                }
            }

            builder.SetStartTime(startDate + startTime);
            builder.SetEndTime(endDate + endTime);
            result.Add(builder.Build());
        }

        return result;
    }

    // Entity-decoded, trimmed text of a node; empty when the node is missing.
    string GetTextContent(HtmlNode node)
    {
        if (node == null)
        {
            return string.Empty;
        }

        string text = node.InnerText;
        text = HtmlEntity.DeEntitize(text);
        return text.Trim();
    }
}
/// <summary>
/// Replaces the HTML entities in <paramref name="value"/> with the characters they represent.
/// </summary>
/// <param name="value">Text that may contain HTML entities.</param>
/// <returns>The result of <c>HtmlEntity.DeEntitize</c> for <paramref name="value"/>.</returns>
private string DecodeValue(string value)
{
    string decoded = HtmlEntity.DeEntitize(value);
    return decoded;
}
public string exchangeStrings(string org) { string ret; byte[] toAry = new byte[2]; string to; StringBuilder sb = new StringBuilder(HtmlEntity.DeEntitize(org)); foreach (structExchangeTable table in exchangeTable) { if (table.orig.Length == 0) { continue; } toAry[0] = (byte)((table.code >> 8) & 0xff); if (toAry[0] != 0) { toAry[1] = (byte)(table.code & 0xff); } else { toAry[0] = (byte)(table.code & 0xff); toAry[1] = 0; } to = System.Text.Encoding.GetEncoding(932).GetString(toAry); sb.Replace(table.orig, to); } ret = sb.ToString(); // [HV] HDTV E0F8 F2CE // [SD] SDTV E0F9 F2CF // [手]手話通訳放送 E0FD F23D3 // [字]字幕放送 E0FE F2D4 // [双]双方向放送 E0FF F2D5 // [デ]番組連動データ放送 E180 F2D6 // [S]ステレオ放送 E181 F2D7 // [二]二ヶ国語放送 E182 F2D8 // [多]音声多重放送 E183 F2D9 // [解]音声解説 E184 F2DA // [SS] サラウンドステレオ E185 F2DB // [B] 圧縮Bモードステレオ E186 D2DC // [N] ニュース E187 F2DD // [天] 天気予報 E18A F2E0 // [交] 交通情報 E18B F2E1 // [映] 劇場映画 E18C F2E2 // [料] 有料放送 E18D F2E4 // [前] 前編 E190 F2E6 // [後] 後編 E191 F2E7 // [再] 再放送 E192 F2E8 // [新] 新番組 E193 F2E9 // [初] 初回放送 E194 F2EA // [終] 最終回 E195 F2EB // [生] 生放送 E196 F2EC // [PV] ペーパービュー E19A F2F0 // {"[HV]",{0xE0,0xF8}}, //{"[SD]",{0xE0,0xF9} //"[手]",0xE0,0xFD //"[字]",0xE0,0xFE //"[双]",0xE0,0xFF //"[デ]",0xE1,0x80 //"[S]",0xE1,0x81 //"[二]",0xE1,0x82 //"[多]",0xE1,0x83 //"[解]",0xE1,0x84 //"[SS]",0xE1,0x85 //"[B]",0xE1,0x86 //"[N]",0xE1,0x87 //"[天]",0xE1,0x8A //"[交]",0xE1,0x8B //"[映]",0xE1,0x8C //"[料]",0xE1,0x8D //"[前]",0xE1,0x90 //"[後]",0xE1,0x91 //"[再]",0xE1,0x92 //"[新]",0xE1,0x93 //"[初]",0xE1,0x94 //"[終]",0xE1,0x95 //"[生]",0xE1,0x96 //"[PV]",0xE1,0x9A // 現在のHTML界隈で使われることがある HTML Entities 群。 	 タブ 
 復帰   スペース ! ! 感嘆符 " " " ダブルクォーテーション //# # シャープ $ $ ドル % % パーセント & & & アンパサンド ' ' アポストロフィ ( ( 開く括弧 ) ) 閉じる括弧 * * アスタリスク + + プラス , , コンマ - - ハイフン . . ピリオド / / スラッシュ 0~9 0~9 数字   改行されないスペース : : コロン ; ; セミコロン < < < 小なり = = イコール > > > 大なり ? ? 疑問符 @ @ アットマーク A~Z A~Z 大文字アルファペット
/// <summary>
/// Writes a plain-text rendering of <paramref name="node"/> and its descendants to
/// <paramref name="outText"/>.
/// </summary>
/// <remarks>
/// Comments and text directly inside script, style or strong elements are skipped.
/// Non-blank text is written entity-decoded. A p element emits CRLF; a br element
/// emits <see cref="Environment.NewLine"/> as long as <c>tries</c> is below
/// <c>maxTries</c> (and increments <c>tries</c> when it does).
/// </remarks>
/// <param name="node">Node to convert.</param>
/// <param name="outText">Destination writer.</param>
private static void ConvertTo(HtmlNode node, TextWriter outText)
{
    HtmlNodeType kind = node.NodeType;

    if (kind == HtmlNodeType.Document)
    {
        ConvertContentTo(node, outText);
        return;
    }

    if (kind == HtmlNodeType.Text)
    {
        // script and style must not be output; titles (strong) are not wanted either
        string owner = node.ParentNode.Name;
        if (owner == "script" || owner == "style" || owner == "strong")
        {
            return;
        }

        string raw = ((HtmlTextNode)node).Text;

        // closing tags that the parser surfaced as text are not content
        if (HtmlNode.IsOverlappedClosingElement(raw))
        {
            return;
        }

        // whitespace-only text is dropped
        if (raw.Trim().Length > 0)
        {
            outText.Write(HtmlEntity.DeEntitize(raw));
        }
        return;
    }

    if (kind != HtmlNodeType.Element)
    {
        // comments (and any other node kind) produce no output
        return;
    }

    string tag = node.Name;
    if (tag == "p")
    {
        // treat paragraphs as crlf
        outText.Write("\r\n");
    }
    else if (tag == "br" && tries < maxTries)
    {
        // respect line breaks too, up to the configured maximum
        tries++;
        outText.Write(Environment.NewLine);
    }

    if (node.HasChildNodes)
    {
        ConvertContentTo(node, outText);
    }
}
/// <summary>
/// Replaces the HTML entities in <paramref name="text"/> with the characters they represent.
/// </summary>
/// <param name="text">Text that may contain HTML entities.</param>
/// <returns>
/// The decoded text, or null when <paramref name="text"/> is null or empty.
/// </returns>
public static string DeEntitize(this string text)
{
    if (string.IsNullOrEmpty(text))
    {
        return null;
    }

    return HtmlEntity.DeEntitize(text);
}
/// <summary>
/// Parses a manga detail page into a <see cref="MangaObject"/>: name, type and reading
/// direction, description, alternate names, authors, artists, genres, cover, rating
/// and the chapter list for the extension's language.
/// </summary>
/// <remarks>
/// Chapters whose chapter number matches an already-parsed chapter and whose
/// sub-chapter differs by at most 4 are merged into that chapter instead of being
/// added. The chapter list is reversed before it is returned. The rating is -1 when
/// it cannot be read.
/// </remarks>
/// <param name="content">Raw HTML of the manga page.</param>
/// <returns>The parsed manga.</returns>
public MangaObject ParseMangaObject(string content)
{
    HtmlDocument MangaObjectDocument = new HtmlDocument();
    MangaObjectDocument.LoadHtml(content);

    HtmlNode InformationNode = MangaObjectDocument.DocumentNode.SelectSingleNode("//div[contains(@class,'ipsBox')]/div");
    String Cover = InformationNode.SelectSingleNode(".//div[1]/img").Attributes["src"].Value;

    HtmlNode MangaProperties = InformationNode.SelectSingleNode(".//table[contains(@class,'ipb_table')]"),
             ChapterListing = MangaObjectDocument.DocumentNode.SelectSingleNode("//table[contains(@class,'chapters_list')]");

    String MangaName = HtmlEntity.DeEntitize(MangaObjectDocument.DocumentNode.SelectSingleNode("//h1[contains(@class,'ipsType_pagetitle')]").InnerText.Trim()),
           MangaTypeProp = HtmlEntity.DeEntitize(MangaProperties.SelectSingleNode(".//tr[5]/td[2]").InnerText),
           Desciption = HtmlEntity.DeEntitize(MangaProperties.SelectSingleNode(".//tr[7]/td[2]").InnerText.Replace("<br>", "\n"));

    MangaObjectType MangaType;
    FlowDirection PageFlowDirection;
    switch (MangaTypeProp.ToLowerInvariant())
    {
        case "manga (japanese)":
            MangaType = MangaObjectType.Manga;
            PageFlowDirection = FlowDirection.RightToLeft;
            break;

        case "manhwa (korean)":
            MangaType = MangaObjectType.Manhwa;
            PageFlowDirection = FlowDirection.LeftToRight;
            break;

        case "manhua (chinese)":
            MangaType = MangaObjectType.Manhua;
            PageFlowDirection = FlowDirection.LeftToRight;
            break;

        default:
            MangaType = MangaObjectType.Unknown;
            PageFlowDirection = FlowDirection.RightToLeft;
            break;
    }

    HtmlNodeCollection AlternateNameNodes = MangaProperties.SelectSingleNode(".//tr[1]/td[2]").SelectNodes(".//span"),
                       GenreNodes = MangaProperties.SelectSingleNode(".//tr[4]/td[2]").SelectNodes(".//a/span");
    String[] AlternateNames = { },
             Authors = { HtmlEntity.DeEntitize(MangaProperties.SelectSingleNode(".//tr[2]/td[2]/a").InnerText) },
             Artists = { HtmlEntity.DeEntitize(MangaProperties.SelectSingleNode(".//tr[3]/td[2]/a").InnerText) },
             Genres = { };
    if (AlternateNameNodes != null && AlternateNameNodes.Count > 0)
    {
        AlternateNames = (from HtmlNode AltNameNode in AlternateNameNodes select HtmlEntity.DeEntitize(AltNameNode.InnerText.Trim())).ToArray();
    }
    if (GenreNodes != null && GenreNodes.Count > 0)
    {
        Genres = (from HtmlNode GenreNode in GenreNodes select HtmlEntity.DeEntitize(GenreNode.InnerText.Trim())).ToArray();
    }

    List<ChapterObject> Chapters = new List<ChapterObject>();

    // A page without a chapter table simply has no chapters.
    HtmlNodeCollection ChapterNodes = ChapterListing == null
        ? null
        : ChapterListing.SelectNodes(String.Format(".//tr[contains(@class,'lang_{0} chapter_row')]", ExtensionDescriptionAttribute.Language));
    if (ChapterNodes != null)
    {
        foreach (HtmlNode ChapterNode in ChapterNodes)
        {
            HtmlNode VolChapNameNode = ChapterNode.SelectSingleNode("td[1]/a");
            String VolChapText = VolChapNameNode.InnerText;
            Match VolChapMatch = Regex.Match(VolChapText, @"(Vol\.(?<Volume>\d+)\s)?(Ch\.(?<Chapter>\d+))(\.(?<SubChapter>\d+))?");

            // The title follows the volume/chapter label and a two-character
            // separator; clamp so an untitled chapter yields "" instead of throwing.
            Int32 NameStart = Math.Min(VolChapMatch.Length + 2, VolChapText.Length);
            String ChapterName = VolChapText.Substring(NameStart).Trim();

            // A group that did not participate has an empty value, which TryParse
            // rejects and leaves the number at 0.
            UInt32 Volume, Chapter, SubChapter;
            UInt32.TryParse(VolChapMatch.Groups["Volume"].Value, out Volume);
            UInt32.TryParse(VolChapMatch.Groups["Chapter"].Value, out Chapter);
            UInt32.TryParse(VolChapMatch.Groups["SubChapter"].Value, out SubChapter);

            DateTime Released = ParseReleaseDate(ChapterNode.SelectSingleNode("td[5]").InnerText);

            String ChapterUrl = VolChapNameNode.Attributes["href"].Value;
            String ChapterHash = ChapterUrl.Split('#').Last().Split('_').First();
            ChapterUrl = String.Format("{0}&p=1&supress_webtoon=t", ChapterHash);

            ChapterObject chapterObject = new ChapterObject()
            {
                Name = HtmlEntity.DeEntitize(ChapterName),
                Volume = Volume,
                Chapter = Chapter,
                SubChapter = SubChapter,
                Released = Released,
                Locations =
                {
                    new LocationObject()
                    {
                        ExtensionName = ExtensionDescriptionAttribute.Name,
                        ExtensionLanguage = ExtensionDescriptionAttribute.Language,
                        Url = ChapterUrl
                    }
                }
            };

            // Same chapter with a near-identical sub-chapter: merge instead of adding.
            ChapterObject Existing = Chapters.Find(o => o.Chapter == chapterObject.Chapter && ((Int32)o.SubChapter - chapterObject.SubChapter).InRange(-4, 4));
            if (Existing == null)
            {
                Chapters.Add(chapterObject);
            }
            else
            {
                Existing.Merge(chapterObject);
            }
        }
    }
    Chapters.Reverse();

    // -1 means "unknown"; it is only replaced when a number was actually parsed.
    Double Rating = -1;
    try
    {
        HtmlNode RatingNode = MangaObjectDocument.DocumentNode.SelectSingleNode("//div[contains(@class,'rating')]");
        String RatingText = new String(RatingNode.InnerText.Trim().Substring(1, 4).Where(IsValidRatingChar).ToArray());
        Double ParsedRating;
        if (Double.TryParse(RatingText, NumberStyles.Float, CultureInfo.InvariantCulture, out ParsedRating))
        {
            Rating = ParsedRating;
        }
    }
    catch
    {
        // The rating is optional; any failure to locate or slice it keeps the sentinel.
    }

    return new MangaObject()
    {
        Name = MangaName,
        MangaType = MangaType,
        PageFlowDirection = PageFlowDirection,
        // Already entity-decoded above; decoding a second time would corrupt text
        // that legitimately contains something looking like an entity.
        Description = Desciption,
        AlternateNames = AlternateNames.ToList(),
        CoverLocations =
        {
            new LocationObject()
            {
                Url = Cover,
                ExtensionName = ExtensionDescriptionAttribute.Name,
                ExtensionLanguage = ExtensionDescriptionAttribute.Language
            }
        },
        Authors = Authors.ToList(),
        Artists = Artists.ToList(),
        Genres = Genres.ToList(),
        Released = (Chapters.FirstOrDefault() ?? new ChapterObject()).Released,
        Chapters = Chapters,
        Rating = Rating
    };
}

/// <summary>
/// Converts a chapter's release text into a timestamp. Supported forms are an absolute
/// date ("dd MMMM yyyy - ...") and a relative age ("N unit(s) ago"). Text in neither
/// form yields the current time; an absolute date that fails to parse yields
/// <c>default(DateTime)</c>.
/// </summary>
private static DateTime ParseReleaseDate(String ReleaseData)
{
    DateTime Released = DateTime.Now;
    ReleaseData = ReleaseData.Trim();

    if (ReleaseData.Contains("-"))
    {
        String[] DateAndTime = ReleaseData.Split(new String[] { " - " }, StringSplitOptions.RemoveEmptyEntries);
        if (DateAndTime.Length > 0)
        {
            DateTime.TryParseExact(DateAndTime[0], "dd MMMM yyyy", CultureInfo.InvariantCulture, DateTimeStyles.None, out Released);
        }
    }
    else if (ReleaseData.EndsWith("ago", StringComparison.Ordinal))
    {
        String[] ReleaseDataParts = ReleaseData.Split(new Char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

        // A non-numeric amount ("A", "An") counts as one unit.
        Double Offset;
        if (!Double.TryParse(ReleaseDataParts[0], NumberStyles.Float, CultureInfo.InvariantCulture, out Offset))
        {
            Offset = 1;
        }
        Offset *= -1;

        String Unit = ReleaseDataParts.Length > 1 ? ReleaseDataParts[1].ToLowerInvariant() : String.Empty;
        switch (Unit)
        {
            default:
            case "second":
            case "seconds":
                Released = Released.AddSeconds(Offset);
                break;

            case "minute":
            case "minutes":
                Released = Released.AddMinutes(Offset);
                break;

            case "hour":
            case "hours":
                Released = Released.AddHours(Offset);
                break;

            case "day":
            case "days":
                Released = Released.AddDays(Offset);
                break;

            case "week":
            case "weeks":
                Released = Released.AddDays(7 * Offset);
                break;

            case "month":
            case "months":
                Released = Released.AddMonths((Int32)Offset);
                break;

            case "year":
            case "years":
                Released = Released.AddYears((Int32)Offset);
                break;
        }
    }

    return Released;
}
/// <summary>
/// Reads a Shaarli HTML bookmark export, rebuilds it as a bookmark tree
/// (root, menu, toolbar, tags and unfiled folders) and serializes that tree
/// to "&lt;input&gt;.export.json".
/// </summary>
/// <param name="input">Path of the Shaarli HTML export file.</param>
private static void ParseSharliInput(string input)
{
    // Setting up the base nodes : root, bookmarks menu, personal bar, tags, unfiled
    Node baseNode = new Node(null, "", 1, null, DateTime.Now.Ticks, DateTime.Now.Ticks, null, Node.TypeMozPlaceContainer, "placesRoot", new List<Node>());
    baseNode.Children.Add(new Node(null, "Menu des marques-pages", 2, 1, DateTime.Now.Ticks, DateTime.Now.Ticks, null, Node.TypeMozPlaceContainer, "bookmarksMenuFolder", null));
    baseNode.Children.Add(new Node(1, "Barre Personnelle", 3, 1, DateTime.Now.Ticks, DateTime.Now.Ticks, null, Node.TypeMozPlaceContainer, "toolbarFolder", null));
    baseNode.Children.Add(new Node(2, "Étiquettes", 4, 1, DateTime.Now.Ticks, DateTime.Now.Ticks, null, Node.TypeMozPlaceContainer, "tagsFolder", new List<Node>()));
    baseNode.Children.Add(new Node(3, "Marque-pages non classés", 5, 1, DateTime.Now.Ticks, DateTime.Now.Ticks, null, Node.TypeMozPlaceContainer, "unfiledBookmarksFolder", new List<Node>()));

    Node tagsFolder = baseNode.Children[2];
    Node unfiledFolder = baseNode.Children[3];

    // Getting the HTML in a better state, loading it into a HtmlDocument
    HtmlDocument html = new HtmlDocument();
    FileInfo filePath = new FileInfo(input);
    try
    {
        using (System.IO.StreamReader sr = new System.IO.StreamReader(filePath.FullName))
        {
            string strHtml = sr.ReadToEnd();
            strHtml = strHtml.Replace("</A>", "</A></DT>");

            // Regex-based repair of the unclosed <DD> tags: a <DT> that starts a
            // line whose predecessor does not end with '>' gets a </DD> in front.
            var regex = new System.Text.RegularExpressions.Regex(@"(?<!\>)\n<DT>");
            strHtml = regex.Replace(strHtml, delegate(System.Text.RegularExpressions.Match m) { return "</DD>" + m.Value; });

            // Fixing html entities
            strHtml = HtmlEntity.DeEntitize(strHtml);
            html.LoadHtml(strHtml);
        }
    }
    catch (IOException)
    {
        Console.WriteLine("Failure : File does not exist : " + input);
        return;
    }

    Console.WriteLine(html.DocumentNode.ChildNodes.Count(node => String.Equals(node.Name, "DT", StringComparison.InvariantCultureIgnoreCase)) + " links found.");
    HtmlNodeCollection nodes = html.DocumentNode.ChildNodes;

    // Every top-level <DT> is one link: its first child is the <A> tag, and the
    // node two positions later (position i+1 is a newline text node) is the
    // optional <DD> description.
    for (int i = 0; i < nodes.Count; i++)
    {
        if (!String.Equals(nodes[i].Name, "DT", StringComparison.InvariantCultureIgnoreCase) || i + 2 >= nodes.Count)
        {
            continue;
        }

        // A <DT> without a child, or whose first child carries no HREF, is not a
        // usable link; skip it instead of failing on a null dereference.
        HtmlNode linkNode = nodes[i].FirstChild;
        if (linkNode == null || linkNode.Attributes["HREF"] == null)
        {
            continue;
        }

        Node annotation = null;
        if (String.Equals(nodes[i + 2].Name, "dd", StringComparison.InvariantCultureIgnoreCase))
        {
            annotation = new Node("bookmarkProperties/description", 0, 4, null, 3, nodes[i + 2].InnerText);
        }

        // Shaarli URLs pointing to itself (starting with '?') cause the bookmark
        // import to fail, so they are redirected to about:blank. An empty HREF is
        // treated the same way rather than indexing into an empty string.
        string url = linkNode.Attributes["HREF"].Value;
        if (String.IsNullOrEmpty(url) || url[0] == '?')
        {
            url = "about:blank";
        }

        unfiledFolder.Children.Add(new Node(null, linkNode.InnerText, i, 5, DateTime.Now.Ticks, DateTime.Now.Ticks, "text/x-moz-place", url, annotation));

        // Tags are only read when the <DT> really wraps an <A> tag.
        if (!HasATagChild(nodes[i], "a"))
        {
            continue;
        }

        if (!linkNode.Attributes.Any(a => String.Equals(a.Name, "TAGS", StringComparison.InvariantCultureIgnoreCase)))
        {
            continue;
        }

        string[] tags = linkNode.Attributes["TAGS"].Value.Split(',');
        foreach (string tag in tags)
        {
            // Find-or-create the tag folder with ONE comparison rule. The previous
            // mix of case-sensitive lookup and case-insensitive re-fetch could
            // create an empty duplicate folder for tags differing only by case.
            Node targetNode = tagsFolder.Children.FirstOrDefault(c => String.Equals(c.Title, tag, StringComparison.InvariantCultureIgnoreCase));
            if (targetNode == null)
            {
                targetNode = new Node(null, tag, i, 4, DateTime.Now.Ticks, DateTime.Now.Ticks, null, Node.TypeMozPlaceContainer, "", new List<Node>());
                tagsFolder.Children.Add(targetNode);
            }

            // Use the sanitized url here too, so self-links do not break the import
            // through the tag folder either.
            targetNode.Children.Add(new Node(null, "", targetNode.ID + targetNode.Children.Count, targetNode.ID, DateTime.Now.Ticks, DateTime.Now.Ticks, "text/x-moz-place", url, annotation));
        }
    }

    Serialize<Node>(baseNode, input + ".export.json");
    Console.WriteLine("Sucess ! output written to " + input + ".export.json.");
}
/// <summary>
/// Parses a search result page into a list of <see cref="SearchResultObject"/>.
/// For every row whose link contains an "r&lt;digits&gt;" id, the "comic_pop"
/// page is downloaded to obtain the cover and the description.
/// </summary>
/// <param name="content">HTML of the search result page.</param>
/// <returns>The parsed results; empty when the result table is not found.</returns>
public List<SearchResultObject> ParseSearch(string content)
{
    List<SearchResultObject> SearchResults = new List<SearchResultObject>();
    Regex IdMatch = new Regex(@"r\d+");

    HtmlDocument SearchResultDocument = new HtmlDocument();
    SearchResultDocument.LoadHtml(content);

    HtmlWeb HtmlWeb = new HtmlWeb();
    HtmlNodeCollection HtmlSearchResults = SearchResultDocument.DocumentNode.SelectNodes("//table[contains(@class,'ipb_table chapters_list')]/tbody/tr[not(contains(@class,'header'))]");
    if (HtmlSearchResults == null)
    {
        return SearchResults;
    }

    foreach (HtmlNode SearchResultNode in HtmlSearchResults)
    {
        HtmlNode NameLink = SearchResultNode.SelectSingleNode(".//td[1]/strong/a");
        if (NameLink == null)
        {
            continue;
        }

        Int32 Id = -1;
        String Name = HtmlEntity.DeEntitize(NameLink.InnerText).Trim(),
               Link = NameLink.GetAttributeValue("href", String.Empty),
               Description = null;
        LocationObject Cover = null;

        // Only slice off the leading 'r' when the pattern really matched;
        // Substring(1) on an empty match value would throw.
        Match IdResult = IdMatch.Match(Link);
        if (IdResult.Success && Int32.TryParse(IdResult.Value.Substring(1), out Id))
        {
            HtmlDocument PopDocument = HtmlWeb.Load(String.Format("{0}/comic_pop?id={1}", ExtensionDescriptionAttribute.RootUrl, Id));
            HtmlNode CoverNode = PopDocument.DocumentNode.SelectSingleNode("//img"),
                     DescriptionNode = PopDocument.DocumentNode.SelectSingleNode("//table/tbody/tr[6]/td[2]");
            if (CoverNode != null && CoverNode.Attributes["src"] != null)
            {
                Cover = new LocationObject()
                {
                    Url = CoverNode.Attributes["src"].Value,
                    ExtensionName = ExtensionDescriptionAttribute.Name,
                    ExtensionLanguage = ExtensionDescriptionAttribute.Language
                };
            }
            if (DescriptionNode != null)
            {
                Description = DescriptionNode.InnerText.Trim();
            }
        }
        else
        {
            // TryParse zeroes its out argument on failure; keep the "unknown" marker.
            Id = -1;
        }

        // The second column holds authors and artists as one combined string.
        HtmlNode AuthorNode = SearchResultNode.SelectSingleNode(".//td[2]");
        String[] Author_Artists = AuthorNode == null
            ? new String[0]
            : new[] { AuthorNode.InnerText.Trim() };

        // The rating is the first (up to) four characters of the div title.
        // Parse with the invariant culture so "4.85" is read the same on every
        // machine, and fall back to -1 instead of throwing on malformed rows.
        Double Rating = -1;
        HtmlNode RatingNode = SearchResultNode.SelectSingleNode(".//td[3]/div");
        if (RatingNode != null)
        {
            String RatingTitle = RatingNode.GetAttributeValue("title", String.Empty);
            String RatingText = RatingTitle.Substring(0, Math.Min(4, RatingTitle.Length));
            if (!Double.TryParse(RatingText,
                                 System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
                                 System.Globalization.CultureInfo.InvariantCulture,
                                 out Rating))
            {
                Rating = -1;
            }
        }

        SearchResults.Add(new SearchResultObject()
        {
            Cover = Cover,
            Description = Description,
            ExtensionName = ExtensionDescriptionAttribute.Name,
            ExtensionLanguage = ExtensionDescriptionAttribute.Language,
            Name = Name,
            Url = Link,
            Id = Id.ToString(),
            Rating = Rating,
            Artists = Author_Artists.ToList(),
            Authors = Author_Artists.ToList()
        });
    }

    return SearchResults;
}
}
/// <summary>
/// Reduces the document: walks the DOM level by level and, driven by the
/// instance <c>settings</c>, renames tags, removes tags, removes empty tags and
/// strips attributes; then optionally collapses empty space, re-renders the
/// markup and prepends a signature comment.
/// </summary>
/// <param name="htmlInput">The HTML input.</param>
/// <returns>The reduced HTML markup.</returns>
public String ReduceDocument(String htmlInput)
{
    HtmlDocument htmlDocument = new HtmlDocument();
    htmlDocument.OptionFixNestedTags = true;
    htmlDocument.OptionAutoCloseOnEnd = true;
    htmlDocument.LoadHtml(htmlInput);

    List<HtmlNode> htmlNodes = htmlDocument.DocumentNode.ChildNodes.ToList();
    var tagNameReplacements = settings.tagNameReplacement.GetDictionary();
    var attWithValueToRemove = settings.attributeWithValueToRemove.GetDictionary();

    // first phase: breadth-first pass, one DOM level per iteration
    while (htmlNodes.Any())
    {
        List<HtmlNode> nextIteration = new List<HtmlNode>();
        foreach (HtmlNode node in htmlNodes)
        {
            String nodeName = node.Name.ToLower();
            if (tagNameReplacements.ContainsKey(nodeName))
            {
                node.Name = tagNameReplacements[nodeName].value;
                nodeName = tagNameReplacements[nodeName].value;
            }

            if (settings.tagsToRemove.Contains(nodeName))
            {
                node.Remove();
                continue;
            }

            Boolean addToNextIteration = true;
            if (settings.emptyTagsToRemove.Contains(nodeName))
            {
                if (IsNodeEmpty(node))
                {
                    node.Remove();
                    addToNextIteration = false;
                }
            }

            if (settings.tagsToRemoveAllAttributes.Contains(nodeName))
            {
                node.Attributes.RemoveAll();
            }

            if (!addToNextIteration)
            {
                continue;
            }

            nextIteration.Add(node);

            // Iterate over a snapshot because attributes are removed while looping.
            foreach (var attribute in node.Attributes.ToList())
            {
                String attributeName = attribute.Name.ToLower();
                if (settings.attributesToRemove.Contains(attributeName))
                {
                    attribute.Remove();
                    // The attribute is already detached; do not fall through to the
                    // value-based rule, which would try to remove it a second time.
                    continue;
                }

                // Attributes without a value are intentionally left untouched.
                if (attribute.Value.isNullOrEmpty())
                {
                    continue;
                }

                if (attWithValueToRemove.ContainsKey(attributeName))
                {
                    if (attribute.Value.toStringSafe() == attWithValueToRemove[attributeName].value)
                    {
                        attribute.Remove();
                    }
                }
            }
        }

        // Next level: children of every node that survived this pass.
        htmlNodes = new List<HtmlNode>();
        foreach (HtmlNode node in nextIteration)
        {
            htmlNodes.AddRange(node.ChildNodes.ToList());
        }
    }

    String outputHtml = htmlDocument.DocumentNode.OuterHtml;

    if (settings.ReduceEmptySpace)
    {
        outputHtml = HtmlEntity.DeEntitize(outputHtml);
        outputHtml = REGEX_SELECTCOMMENTS.Replace(outputHtml, "");
        outputHtml = REGEX_EMPTYSPACE.Replace(outputHtml, ">" + Environment.NewLine + "<");
        outputHtml = outputHtml.Replace("><", ">" + Environment.NewLine + "<");

        String doubleNewLine = Environment.NewLine + Environment.NewLine;
        Int32 i = 0;
        // Ordinal Contains: also detects a blank line at index 0 and does not
        // depend on culture-sensitive matching of newline characters.
        while (outputHtml.Contains(doubleNewLine))
        {
            outputHtml = outputHtml.Replace(doubleNewLine, Environment.NewLine);
            i++;
            if (i > 100)
            {
                break; // safety cap on the number of collapse passes
            }
        }
    }

    if (settings.RebuildHtml)
    {
        HtmlDocument document = new HtmlDocument();
        document.LoadHtml(outputHtml);
        StringBuilder sb = new StringBuilder();
        RenderNode(document.DocumentNode, sb, 0);
        outputHtml = sb.ToString();
    }

    if (settings.InsertReductionSignature)
    {
        String headerComment = RenderCommentNode("imbSCI.DataExtraction - Reduced HTML document");
        outputHtml = headerComment + Environment.NewLine + outputHtml;
    }

    return outputHtml;
}
/// <summary>
/// Verifies that <c>HtmlEntity.DeEntitize</c> turns a numeric character
/// reference for the apostrophe back into the literal character.
/// </summary>
public void DeEntitize()
{
    // The input must actually contain an entity, otherwise the assertion
    // would pass even if DeEntitize did nothing at all.
    var html = @"mouse&#39;s house";

    Assert.AreEqual("mouse's house", HtmlEntity.DeEntitize(html));
}
/// <summary>
/// Converts a raw scraped grant item into a structured <see cref="GrantItemInfo"/>:
/// splits the "[type] title" heading, turns the detail list items into
/// name/values pairs, appends the description as a final detail and parses the
/// javascript links.
/// </summary>
/// <param name="info">The raw item whose HTML fragments are parsed.</param>
/// <returns>The structured item.</returns>
/// <exception cref="FormatException">A detail line splits into more than two segments.</exception>
/// <exception cref="InvalidOperationException">The first detail line has no name to attach to.</exception>
/// <exception cref="NotSupportedException">A link matches none of the known javascript patterns.</exception>
public static GrantItemInfo ToGrantInfo(this RawGrantItemInfo info)
{
    var regTitle = new Regex(@"\[(?<type>.*)\] (?<title>.*)");
    var titleMatch = regTitle.Match(info.Title);
    var title = titleMatch.Groups["title"].Value;
    var type = titleMatch.Groups["type"].Value;

    var docDetails = new HtmlDocument();
    docDetails.LoadHtml(info.Details);
    var lis = docDetails.DocumentNode.CssSelect("li");

    var details = new List<GrantDetailInfo>();
    foreach (var li in lis)
    {
        if (string.IsNullOrWhiteSpace(li.InnerText))
        {
            continue;
        }

        // Keep direct text plus the text of <div> children that wrap a single
        // text node; every other child contributes nothing (null joins as "").
        var text = string.Join("", li.ChildNodes.Select(child =>
        {
            if (child.NodeType == HtmlNodeType.Text)
            {
                return child.InnerText;
            }

            if (child.NodeType == HtmlNodeType.Element
                && child.Name == "div"
                && child.ChildNodes.Count == 1
                && child.ChildNodes.First().NodeType == HtmlNodeType.Text)
            {
                return child.ChildNodes.First().InnerText;
            }

            return null;
        }));
        text = HtmlEntity.DeEntitize(text);

        var segs = text.SplitTwo(":");
        if (segs.Length == 1)
        {
            // A line without a separator continues the previous detail: merge
            // it into that detail's values and re-add the combined entry.
            if (details.Count == 0)
            {
                throw new InvalidOperationException("Detail line '" + text + "' has no name and no preceding detail to continue.");
            }

            var last = details[details.Count - 1];
            segs = new[] { last.Name, string.Join(";", last.Values.Concat(new[] { segs[0] })) };
            details.RemoveAt(details.Count - 1);
        }
        else if (segs.Length > 2)
        {
            throw new FormatException("Detail line '" + text + "' has more than one name/value separator.");
        }

        var vals = segs[1]
                   .Split(new[] { ";" }, StringSplitOptions.RemoveEmptyEntries)
                   .Select(v => v.Trim())
                   .ToArray();
        details.Add(new GrantDetailInfo { Name = segs[0].Trim(), Values = vals });
    }

    // parse description
    var docDesc = new HtmlDocument();
    docDesc.LoadHtml(info.Description);
    var desc = docDesc.DocumentNode.InnerText;
    var leadingDesc = "";
    if (desc.EndsWith("全部"))
    {
        desc = desc.Substring(0, desc.Length - 2).Trim();
    }
    desc = HtmlEntity.DeEntitize(desc);

    // SplitTwo can yield a single segment (see the detail loop above); in that
    // case the whole text is the value and the name stays empty.
    var d = desc.SplitTwo(":");
    if (d.Length > 1)
    {
        leadingDesc = d[0].Trim();
        desc = d[1].Trim();
    }
    else
    {
        desc = desc.Trim();
    }
    details.Add(new GrantDetailInfo { Name = leadingDesc, Values = new[] { desc } });

    // parse links
    var rePam = new Regex(@"javascript\:pam3\('(?<type>[piudg]{3})','(?<id>.+)','(?<index>\d?)'\);");
    var reTx = new Regex(@"javascript\:sw_xx\('(?<number>.*)'\);");
    var docLinks = new HtmlDocument();
    docLinks.LoadHtml(info.Links);
    var links = docLinks.DocumentNode.CssSelect("span a")
                .Select(link => new { href = link.GetAttributeValue("href"), text = link.InnerText })
                .Select(link =>
                {
                    var pamMatch = rePam.Match(link.href);
                    if (pamMatch.Success)
                    {
                        return (GrantItemLinkBase) new GrantItemPamLink
                        {
                            Title = link.text,
                            Type = pamMatch.Groups["type"].Value,
                            Id = pamMatch.Groups["id"].Value,
                            Index = pamMatch.Groups["index"].Value,
                        };
                    }

                    var txMatch = reTx.Match(link.href);
                    if (txMatch.Success)
                    {
                        return new GrantItemTxLink
                        {
                            Title = link.text,
                            Number = txMatch.Groups["number"].Value,
                        };
                    }

                    throw new NotSupportedException("cannot parse link");
                })
                .ToArray();

    // Derive the full-size image from the thumbnail name; the dot is escaped so
    // only a literal "_thumb.jpg" suffix is rewritten.
    var imageUrl = Regex.Replace(info.Image, @"_thumb\.jpg$", ".jpg");

    return new GrantItemInfo
    {
        Id = info.Id,
        Details = details.ToArray(),
        ThumbImage = info.Image,
        Image = imageUrl,
        Links = links,
        QrImage = info.QrImage,
        Title = title,
        Type = type,
    };
}
/// <summary>
/// Parses a seasonal anime listing page into an <see cref="AnimeListData"/>,
/// including previous/next page links when a pager is present.
/// Any exception raised while parsing is written to the console and the
/// entries collected so far are returned.
/// </summary>
/// <param name="HTMLCode">HTML of the listing page.</param>
/// <param name="link">URL the page was loaded from; used to build pager links.</param>
/// <returns>The parsed list (possibly partial or empty).</returns>
private static AnimeListData AnalyzeDocument(string HTMLCode, string link)
{
    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(HTMLCode);
    AnimeListData list = new AnimeListData();

    // First descendant <tag> of root whose class attribute equals cssClass exactly.
    HtmlNode FirstByClass(HtmlNode root, string tag, string cssClass)
    {
        return root.Descendants(tag).First(x => x.GetAttributeValue("class", "") == cssClass);
    }

    try
    {
        foreach (var animeNode in doc.DocumentNode.Descendants("div").Where(x => x.GetAttributeValue("class", "") == "seasonal-anime js-seasonal-anime"))
        {
            HtmlNode titleNode = FirstByClass(animeNode, "p", "title-text");
            CoreAnimeEntry page = new CoreAnimeEntry()
            {
                Title = Utility.FixString(HtmlEntity.DeEntitize(titleNode.InnerText)),
                AnimeLink = new LinkInfo(titleNode.Descendants("a").First().GetAttributeValue("href", "")),
                Episodes = Utility.FixString(HtmlEntity.DeEntitize(FirstByClass(animeNode, "div", "eps").InnerText)),
                ImageLink = new LinkInfo(FirstByClass(animeNode, "div", "image").Descendants().First().GetAttributeValue("src", "")),
                Synopsis = HtmlEntity.DeEntitize(FirstByClass(animeNode, "div", "synopsis js-synopsis").InnerText),
            };

            HtmlNode sourceNode = FirstByClass(animeNode, "span", "source");
            if (sourceNode.InnerText.Length > 1)
            {
                page.Source = (AnimeSourceType)Enum.Parse(typeof(AnimeSourceType), Utility.FixEnum(sourceNode.InnerText));
            }

            HtmlNode producerNode = FirstByClass(animeNode, "span", "producer");
            LinkInfo prod = new LinkInfo()
            {
                Name = Utility.FixString(HtmlEntity.DeEntitize(producerNode.InnerText))
            };
            if (producerNode.Descendants("a").Any())
            {
                prod.Path = Utility.GetCorrectLinkFormat(producerNode.Descendants("a").First().GetAttributeValue("href", ""));
            }
            page.Studios.Add(prod);

            // Genres are the <span class="genre"> elements. The filter has to read
            // the "class" attribute like every other lookup here; reading a
            // non-existent "span" attribute never matched anything.
            foreach (var genreNode in animeNode.Descendants("span").Where(x => x.GetAttributeValue("class", "") == "genre"))
            {
                page.Genres.Add(Utility.FixString(genreNode.InnerText));
            }

            // NOTE(review): the parsed score is not stored anywhere in this method —
            // confirm whether CoreAnimeEntry is meant to receive it.
            float.TryParse(FirstByClass(animeNode, "span", "score").InnerText, out float score);

            page.Members = Utility.GetIntFromString(FirstByClass(animeNode, "span", "member fl-r").InnerText);

            // The info text is split on '-': the part before it is used as the type,
            // the part after it holds comma separated aired / broadcast data.
            string[] str = HtmlEntity.DeEntitize(FirstByClass(animeNode, "div", "info").InnerText).Split('-');
            page.Type = Utility.FixString(str[0].Replace(" ", ""));
            string[] airedParts = str[1].Split(',');
            page.Aired = Utility.FixString(airedParts.Take(2).Aggregate((x, y) => x + ", " + y));
            if (airedParts.Length > 2)
            {
                page.Broadcast = Utility.FixString(airedParts[2]);
            }

            list.Animes.Add(page);
        }

        // Pager: present when an element with class "mt12 mb12" exists.
        if (doc.DocumentNode.Descendants().Any(x => x.GetAttributeValue("class", "") == "mt12 mb12"))
        {
            int currOffset = Utility.GetIntFromString(doc.DocumentNode.Descendants().First(x => x.GetAttributeValue("class", "") == "link current").GetAttributeValue("href", "").Split('=').Last());
            var links = doc.DocumentNode.Descendants().Where(x => x.GetAttributeValue("class", "") == "link");
            int minOffset = Utility.GetIntFromString(links.First().GetAttributeValue("href", "").Split('=').Last());
            int maxOffset = Utility.GetIntFromString(links.Last().GetAttributeValue("href", "").Split('=').Last());

            string newLink = "";
            if (link.Contains("?page="))
            {
                newLink = link.Split('=')[0] + "=";
            }
            else
            {
                newLink = link + "?page=";
            }

            if (currOffset > minOffset)
            {
                list.PreviousPageLink = new LinkInfo(newLink + (currOffset - 1));
            }
            if (currOffset < maxOffset)
            {
                list.NextPageLink = new LinkInfo(newLink + (currOffset + 1));
            }
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex.Message + ex.StackTrace + link);
    }

    return list;
}
/// <summary>
/// Replaces the character entities contained in a string with the characters
/// they stand for, by delegating to <c>HtmlEntity.DeEntitize</c>.
/// </summary>
/// <param name="value">Text that may contain HTML character entities.</param>
/// <returns>The result of <c>HtmlEntity.DeEntitize</c> for <paramref name="value"/>.</returns>
public static String HtmlDecode(String value) => HtmlEntity.DeEntitize(value);
/// <summary>
/// Returns the inner text of a node with its HTML entities decoded.
/// </summary>
/// <param name="htmlNode">Node whose <c>InnerText</c> is read.</param>
/// <returns>The de-entitized inner text.</returns>
public static string GetDecodedInnerText(this HtmlNode htmlNode)
{
    string rawText = htmlNode.InnerText;
    return HtmlEntity.DeEntitize(rawText);
}
/// <summary>
/// Writes the plain-text rendering of <paramref name="node"/> and its
/// descendants to <paramref name="outText"/>: comments, script and style
/// content are dropped, paragraphs and line breaks become new lines, and an
/// anchor's target is emitted as "&lt;href&gt;" before its content.
/// </summary>
/// <param name="node">Node to convert.</param>
/// <param name="outText">Writer receiving the text.</param>
private static void ConvertTo(HtmlNode node, TextWriter outText)
{
    string html;
    switch (node.NodeType)
    {
        case HtmlNodeType.Comment:
            // don't output comments
            break;

        case HtmlNodeType.Document:
            ConvertContentTo(node, outText);
            break;

        case HtmlNodeType.Text:
            // script and style must not be output
            string parentName = node.ParentNode.Name;
            if ((parentName == "script") || (parentName == "style"))
            {
                break;
            }

            // get text
            html = ((HtmlTextNode)node).Text;

            // is it in fact a special closing node output as text?
            if (HtmlNode.IsOverlappedClosingElement(html))
            {
                break;
            }

            // check the text is meaningful and not a bunch of white spaces
            if (html.Trim().Length > 0)
            {
                outText.Write(HtmlEntity.DeEntitize(html));
            }
            break;

        case HtmlNodeType.Element:
            switch (node.Name)
            {
                case "p":
                    // treat paragraphs as crlf
                    outText.Write(Environment.NewLine);
                    break;

                case "br":
                    outText.Write(Environment.NewLine);
                    break;

                case "a":
                    // Anchors without an href (e.g. named anchors) have no target
                    // to print; the indexer returns null for them.
                    HtmlAttribute att = node.Attributes["href"];
                    if (att != null)
                    {
                        outText.Write($"<{att.Value}>");
                    }
                    break;
            }

            if (node.HasChildNodes)
            {
                ConvertContentTo(node, outText);
            }
            break;
    }
}
/// <summary>
/// Writes the plain-text rendering of <paramref name="node"/> and its
/// descendants to <paramref name="outText"/>, collapsing runs of white space
/// and tracking in <paramref name="textInfo"/> what was written last so that
/// block and inline elements are separated sensibly.
/// </summary>
/// <param name="node">Node to convert.</param>
/// <param name="outText">Writer receiving the text.</param>
/// <param name="textInfo">Mutable state describing the text written so far.</param>
internal static void ConvertTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
{
    string html;
    switch (node.NodeType)
    {
        case HtmlNodeType.Comment:
            // don't output comments
            break;

        case HtmlNodeType.Document:
            ConvertContentTo(node, outText, textInfo);
            break;

        case HtmlNodeType.Text:
            // script and style must not be output
            string parentName = node.ParentNode.Name;
            if ((parentName == "script") || (parentName == "style"))
            {
                break;
            }

            // get text
            html = ((HtmlTextNode)node).Text;

            // is it in fact a special closing node output as text?
            if (HtmlNode.IsOverlappedClosingElement(html))
            {
                break;
            }

            // check the text is meaningful and not a bunch of whitespaces
            if (html.Length == 0)
            {
                break;
            }

            // Leading white space is dropped at the start of a block and
            // directly after a space that has already been written.
            if (!textInfo.FirstTextOfBlockWritten || textInfo.LastCharWasSpace)
            {
                html = html.TrimStart();
                if (html.Length == 0)
                {
                    break;
                }
                textInfo.FirstTextOfBlockWritten = true;
            }

            outText.Write(HtmlEntity.DeEntitize(Regex.Replace(html.TrimEnd(), @"\s{2,}", " ")));

            // Trailing white space is reduced to one space and remembered.
            if (textInfo.LastCharWasSpace = char.IsWhiteSpace(html[html.Length - 1]))
            {
                outText.Write(' ');
            }
            break;

        case HtmlNodeType.Element:
            string endElementString = null;
            bool isInline;
            switch (node.Name)
            {
                case "p":
                case "div":
                    // stylistic - adjust as you tend to use
                    if (textInfo.IsFirstElementOfDoc)
                    {
                        textInfo.IsFirstElementOfDoc = false;
                    }
                    else
                    {
                        outText.Write("\r\n");
                    }
                    endElementString = "\r\n";
                    isInline = false;
                    break;

                case "a":
                    if (node.Attributes.Contains("href"))
                    {
                        string href = node.Attributes["href"].Value;
                        // Only append the target when the link text does not already contain it.
                        if (node.InnerText.IndexOf(href, StringComparison.InvariantCultureIgnoreCase) == -1)
                        {
                            endElementString = "<" + href + ">";
                        }
                    }
                    isInline = true;
                    break;

                case "li":
                    // not doing ol li elements at this stage; '*' is the bullet, followed by a tab
                    outText.Write("\r\n*\t");
                    isInline = false;
                    break;

                case "ul":
                    endElementString = "\r\n";
                    isInline = false;
                    break;

                case "img":
                    // inline-block in reality, but KISS.
                    // Rendered as "[alt<src>]". The closing bracket is written here
                    // because the end string below is only emitted for elements that
                    // have child nodes, which an <img> does not.
                    bool hasAlt = node.Attributes.Contains("alt");
                    if (hasAlt)
                    {
                        outText.Write('[' + node.Attributes["alt"].Value);
                    }
                    if (node.Attributes.Contains("src"))
                    {
                        // print the src attribute (not alt, which may not even exist)
                        outText.Write('<' + node.Attributes["src"].Value + '>');
                    }
                    if (hasAlt)
                    {
                        outText.Write("]");
                    }
                    isInline = true;
                    break;

                case "span":
                case "strong":
                case "em":
                    isInline = true;
                    break;

                default:
                    isInline = false;
                    break;
            }

            if (node.HasChildNodes)
            {
                // Block elements start with fresh text state; inline ones share the caller's.
                ConvertContentTo(node, outText, isInline ? textInfo : new PreceedingDomTextInfo());
                if (endElementString != null)
                {
                    outText.Write(endElementString);
                }
            }
            break;
    }
}
/// <summary>
/// Writes the plain-text rendering of <paramref name="node"/> and its
/// descendants to <paramref name="outText"/>.
/// </summary>
/// <param name="node">Node to convert.</param>
/// <param name="outText">Writer receiving the text.</param>
/// <returns>
/// <c>true</c> when text content was written for the node; <c>false</c>
/// otherwise (including for <c>br</c> and <c>hr</c>, which only emit a line).
/// </returns>
private static bool ConvertTo(HtmlNode node, TextWriter outText)
{
    var result = false;
    switch (node.NodeType)
    {
        case HtmlNodeType.Document:
            result = ConvertContentTo(node, outText);
            break;

        case HtmlNodeType.Text:
            // script and style must not be output
            var parentName = node.ParentNode.Name;
            if (parentName == "script" || parentName == "style")
            {
                break;
            }

            // get text
            var html = ((HtmlTextNode)node).Text;

            // is it in fact a special closing node output as text?
            if (HtmlNode.IsOverlappedClosingElement(html))
            {
                break;
            }

            // Line breaks inside text become spaces; pure white space is dropped.
            var text = HtmlEntity.DeEntitize(html.Replace("\r\n", " ").Replace("\n", " ").Trim());
            if (string.IsNullOrEmpty(text))
            {
                break;
            }

            result = true;
            outText.Write(text);
            break;

        case HtmlNodeType.Element:
            switch (node.Name)
            {
                case "br":
                    outText.WriteLine();
                    return false;

                case "hr":
                    outText.WriteLine(new string('_', 32));
                    return false;

                case "img":
                    // An <img> without alt yields the null default; coalesce before
                    // trimming so such images are skipped instead of throwing.
                    var alt = (node.GetAttributeValue("alt", null) ?? string.Empty).Trim();
                    if (!string.IsNullOrEmpty(alt))
                    {
                        result = true;
                        outText.Write($"[{alt}]");
                    }
                    break;

                case "li":
                    outText.Write("- ");
                    break;
            }

            if (node.HasChildNodes)
            {
                result = ConvertContentTo(node, outText);
            }

            // Trailers are only written for elements that produced content.
            if (result)
            {
                switch (node.Name)
                {
                    case "p":
                    case "div":
                    case "tr":
                    case "li":
                        outText.WriteLine();
                        break;

                    case "a":
                        var href = node.GetAttributeValue("href", null);
                        if (!string.IsNullOrEmpty(href))
                        {
                            outText.Write($"<{href}>");
                        }
                        break;
                }
            }
            break;
    }

    return result;
}
/// <summary>
/// Writes a lightweight Markdown-like rendering of <paramref name="node"/> and
/// its descendants to <paramref name="outText"/>: headings get '#' prefixes,
/// emphasis is wrapped in '_' / '__', code in back-ticks, links become
/// "[text](href)", and list items are bulleted or numbered.
/// </summary>
/// <param name="node">Node to convert.</param>
/// <param name="outText">Writer receiving the text.</param>
/// <param name="textInfo">Mutable state describing the text written so far.</param>
private static void ConvertTo(HtmlNode node, TextWriter outText, PrecedingDomTextInfo textInfo)
{
    string html;
    switch (node.NodeType)
    {
        case HtmlNodeType.Comment:
            // don't output comments
            break;

        case HtmlNodeType.Document:
            ConvertContentTo(node, outText, textInfo);
            break;

        case HtmlNodeType.Text:
            // script and style must not be output
            string parentName = node.ParentNode.Name;
            if ((parentName == "script") || (parentName == "style"))
            {
                break;
            }

            // get text
            html = ((HtmlTextNode)node).Text;

            // is it in fact a special closing node output as text?
            if (HtmlNode.IsOverlappedClosingElement(html))
            {
                break;
            }

            // check the text is meaningful and not a bunch of whitespaces
            if (html.Length == 0)
            {
                break;
            }

            if (!textInfo.WritePrecedingWhiteSpace || textInfo.LastCharWasSpace)
            {
                html = html.TrimStart();
                if (html.Length == 0)
                {
                    break;
                }
                textInfo.IsFirstTextOfDocWritten.Value = textInfo.WritePrecedingWhiteSpace = true;
            }

            outText.Write(HtmlEntity.DeEntitize(Regex.Replace(html.TrimEnd(), @"\s{2,}", " ")));
            if (textInfo.LastCharWasSpace = char.IsWhiteSpace(html[html.Length - 1]))
            {
                outText.Write(' ');
            }
            break;

        case HtmlNodeType.Element:
            string? endElementString = null;
            bool isInline;
            bool skip = false;
            int listIndex = 0;
            switch (node.Name)
            {
                case "nav":
                    // navigation content is not rendered at all
                    skip = true;
                    isInline = false;
                    break;

                case "body":
                case "section":
                case "article":
                case "aside":
                case "header":
                case "footer":
                case "address":
                case "main":
                case "div":
                case "p":
                    // stylistic - adjust as you tend to use
                    if (textInfo.IsFirstTextOfDocWritten)
                    {
                        outText.Write("\r\n");
                    }
                    endElementString = "\r\n";
                    isInline = false;
                    break;

                case "em":
                    // The opening marker is written unconditionally so it always
                    // pairs with the closing marker emitted below.
                    outText.Write("_");
                    endElementString = "_";
                    isInline = true;
                    break;

                case "strong":
                    outText.Write("__");
                    endElementString = "__";
                    isInline = true;
                    break;

                case "h1":
                case "h2":
                case "h3":
                case "h4":
                case "h5":
                case "h6":
                    // Only the separating line break depends on earlier output; the
                    // '#' prefix is part of the heading itself and is always written.
                    if (textInfo.IsFirstTextOfDocWritten)
                    {
                        outText.Write("\r\n");
                    }
                    int level = int.Parse(node.Name.Substring(1));
                    outText.Write(new string('#', level));
                    outText.Write(" ");
                    endElementString = "\r\n";
                    isInline = false;
                    break;

                case "br":
                    outText.Write("\r\n");
                    skip = true;
                    textInfo.WritePrecedingWhiteSpace = false;
                    isInline = true;
                    break;

                case "a":
                    if (node.Attributes.Contains("href"))
                    {
                        string href = node.Attributes["href"].Value?.Trim() ?? "";
                        // data: URIs are not useful as link targets in text output
                        if (!string.IsNullOrEmpty(href) && !href.StartsWith("data:"))
                        {
                            outText.Write("[");
                            endElementString = $"]({href})";
                        }
                    }
                    isInline = true;
                    break;

                case "code":
                    outText.Write("`");
                    endElementString = "`";
                    isInline = true;
                    break;

                case "li":
                    if (textInfo.ListIndex > 0)
                    {
                        // numbered item of an ordered list
                        outText.Write("\r\n{0}.\t", textInfo.ListIndex++);
                    }
                    else
                    {
                        // '*' as bullet char, with a tab after
                        outText.Write("\r\n*\t");
                    }
                    isInline = false;
                    break;

                case "ol":
                    // ordered lists start numbering at 1, otherwise behave like <ul>
                    listIndex = 1;
                    goto case "ul";

                case "ul":
                    // not handling nested lists any differently at this stage
                    endElementString = "\r\n";
                    isInline = false;
                    break;

                case "img":
                    // inline-block in reality
                    if (node.Attributes.Contains("alt"))
                    {
                        outText.Write('[' + node.Attributes["alt"].Value);
                        endElementString = "]";
                    }
                    if (node.Attributes.Contains("src"))
                    {
                        outText.Write('<' + node.Attributes["src"].Value + '>');
                    }
                    isInline = true;
                    break;

                default:
                    isInline = true;
                    break;
            }

            if (!skip && node.HasChildNodes)
            {
                // Block elements get their own text state (carrying the list counter);
                // inline elements continue with the caller's state.
                ConvertContentTo(node, outText,
                    isInline
                        ? textInfo
                        : new PrecedingDomTextInfo(textInfo.IsFirstTextOfDocWritten) { ListIndex = listIndex });
            }

            if (endElementString != null)
            {
                outText.Write(endElementString);
            }
            break;
    }
}
/// <summary>
/// Emits a marker-annotated text rendering of <paramref name="node"/> and its
/// descendants. Nothing is written once the end-of-content flag is set, which
/// happens when, after an h1 has been seen, a div is met whose class contains
/// one of the <c>DIV_CLASS_END</c> entries.
/// </summary>
/// <param name="node">Node to convert.</param>
/// <param name="outText">Writer receiving the text.</param>
private void ConvertToText(HtmlNode node, TextWriter outText)
{
    if (hasContentEnd)
    {
        return;
    }

    if (node.NodeType == HtmlNodeType.Document)
    {
        ConvertContentTo(node, outText);
        return;
    }

    if (node.NodeType == HtmlNodeType.Text)
    {
        // Text that lives inside script or style elements is never emitted.
        string ownerName = node.ParentNode.Name;
        if ((ownerName == "script") || (ownerName == "style"))
        {
            return;
        }

        string rawText = ((HtmlTextNode)node).Text;

        // Closing tags that the parser surfaced as text are ignored.
        if (HtmlNode.IsOverlappedClosingElement(rawText))
        {
            return;
        }

        // Skip chunks that consist of white space only.
        if (rawText.Trim().Length > 0)
        {
            outText.Write(HtmlEntity.DeEntitize(rawText));
        }
        return;
    }

    if (node.NodeType != HtmlNodeType.Element)
    {
        // Comments (and anything else) produce no output.
        return;
    }

    // Text to append once the children have been rendered, if any.
    string trailer = null;
    switch (node.Name)
    {
        case "pre":
            outText.Write("\r\n^\r\n");
            trailer = "\r\nⱽ\r\n";
            break;

        case "ol":
        case "ul":
            outText.Write("\r\n⌐\r\n");
            trailer = "\r\n┘\r\n";
            break;

        case "li":
            outText.Write("\r\n● ");
            break;

        case "div":
            outText.Write("\r\n");
            if (hasH1 && !hasContentEnd)
            {
                var cssClass = node.getAttribute("class");
                if (cssClass != null && cssClass.Length > 0)
                {
                    if (DIV_CLASS_END.Any(x => cssClass.IndexOf(x) != -1))
                    {
                        hasContentEnd = true;
                    }
                }
            }
            break;

        case "p":
            outText.Write("\r\n");
            break;

        case "h2":
        case "h3":
        case "h4":
        case "h5":
        case "h6":
            outText.Write("\r\n■ ");
            trailer = "\r\n";
            break;

        case "h1":
            hasH1 = true;
            outText.Write("\r\n{H1}\r\n");
            break;

        case "img":
            var source = node.getAttribute("src");
            if (!string.IsNullOrEmpty(source))
            {
                outText.Write("\r\n{IMG-" + source + "-IMG}\r\n");
            }
            break;
    }

    if (node.HasChildNodes)
    {
        ConvertContentTo(node, outText);
    }

    if (trailer != null)
    {
        outText.Write(trailer);
    }
}
/// <summary>
/// Appends the plain-text form of <paramref name="nodes"/> to
/// <paramref name="builder"/>. Runs of ordinary white space collapse into a
/// single space, the characters 0xA0, 0x2007 and 0x202F are always kept as a
/// space, "br" and non-inline elements force line breaks, and elements listed
/// in <c>NonVisibleTags</c> are skipped entirely.
/// </summary>
/// <param name="builder">Receives the generated text.</param>
/// <param name="state">White-space / line state carried across calls.</param>
/// <param name="nodes">Nodes to render, in document order.</param>
private static void Plain(StringBuilder builder, ref ToPlainTextState state, IEnumerable<HtmlNode> nodes)
{
    foreach (HtmlNode current in nodes)
    {
        HtmlTextNode textNode = current as HtmlTextNode;
        if (textNode != null)
        {
            foreach (char c in HtmlEntity.DeEntitize(textNode.Text))
            {
                bool isKeptSpace = c == 0xA0 || c == 0x2007 || c == 0x202F;

                if (char.IsWhiteSpace(c) && !isKeptSpace)
                {
                    // Collapsible white space is only remembered, and only after visible text.
                    if (state == ToPlainTextState.NotWhiteSpace)
                    {
                        state = ToPlainTextState.WhiteSpace;
                    }
                    continue;
                }

                // Flush one pending space, then write the character itself
                // (a plain space for the always-kept space characters).
                if (state == ToPlainTextState.WhiteSpace)
                {
                    builder.Append(' ');
                }
                builder.Append(isKeptSpace ? ' ' : c);
                state = ToPlainTextState.NotWhiteSpace;
            }
            continue;
        }

        string tagName = current.Name.ToLower();

        if (tagName == "br")
        {
            builder.AppendLine();
            state = ToPlainTextState.StartLine;
            continue;
        }

        if (NonVisibleTags.Contains(tagName))
        {
            continue;
        }

        if (InlineTags.Contains(tagName))
        {
            Plain(builder, ref state, current.ChildNodes);
            continue;
        }

        // Any other element is a block: make sure it starts and ends on its own line.
        if (state != ToPlainTextState.StartLine)
        {
            builder.AppendLine();
            state = ToPlainTextState.StartLine;
        }

        Plain(builder, ref state, current.ChildNodes);

        if (state != ToPlainTextState.StartLine)
        {
            builder.AppendLine();
            state = ToPlainTextState.StartLine;
        }
    }
}
/// <summary>
/// Renders <paramref name="node"/> and its descendants as highlighted console
/// text. Nodes whose id or class matches an entry of <c>Filters</c> are
/// skipped; http/https targets of images and anchors are collected in
/// <c>Images</c> and <c>Links</c> and referenced by their running number.
/// </summary>
/// <param name="node">Node to convert.</param>
/// <param name="outText">Writer receiving the text.</param>
public void ConvertTo(HtmlNode node, TextWriter outText)
{
    // Resolves a raw attribute value to an http/https URI: first as an absolute
    // URI, then relative to BaseUri.
    bool TryResolveHttpUri(string raw, out Uri resolved)
    {
        if (Uri.TryCreate(raw, UriKind.Absolute, out resolved) && (resolved.Scheme == Uri.UriSchemeHttp || resolved.Scheme == Uri.UriSchemeHttps))
        {
            return true;
        }

        return Uri.TryCreate(BaseUri, raw, out resolved) && (resolved.Scheme == Uri.UriSchemeHttp || resolved.Scheme == Uri.UriSchemeHttps);
    }

    if (Filters != null)
    {
        // "#id" style filters
        string nodeId = node.Id.Trim();
        if (Filters.Any(f => f.TrimStart('#') == nodeId))
        {
            return;
        }

        // ".class" style filters, compared against the whole class attribute
        if (node.Attributes.Contains("class"))
        {
            string nodeClass = node.Attributes["class"].Value.Trim();
            if (Filters.Any(f => f.TrimStart('.') == nodeClass))
            {
                return;
            }
        }
    }

    switch (node.NodeType)
    {
        case HtmlNodeType.Comment:
            // comments are not rendered
            return;

        case HtmlNodeType.Document:
            ConvertContentTo(node, outText);
            return;

        case HtmlNodeType.Text:
        {
            // content of script and style elements is not rendered
            string ownerName = node.ParentNode.Name;
            if ((ownerName == "script") || (ownerName == "style"))
            {
                return;
            }

            string rawText = ((HtmlTextNode)node).Text;

            // closing tags surfaced as text by the parser are ignored
            if (HtmlNode.IsOverlappedClosingElement(rawText))
            {
                return;
            }

            // only meaningful text is written; trim characters become spaces first
            if (rawText.Trim().Length > 0)
            {
                foreach (var c in trimChars)
                {
                    rawText = rawText.Replace(c, ' ');
                }
                outText.Write(HtmlEntity.DeEntitize(rawText).Trim(trimChars));
            }
            return;
        }

        case HtmlNodeType.Element:
            break;

        default:
            return;
    }

    // Element handling. Cases that render their own children return directly;
    // all others fall through to the generic child rendering at the end.
    switch (node.Name.ToLower())
    {
        case "title":
            if (node.HasChildNodes)
            {
                ConvertContentTo(node, outText);
            }
            return;

        case "meta":
            // extract description
            if (node.GetAttributeValue("name", "") == "description")
            {
                outText.Write(Environment.NewLine);
                outText.Write(node.GetAttributeValue("content", ""));
                outText.Write(Environment.NewLine);
            }
            break;

        case "h1":
        case "h2":
        case "h3":
            // headers are set apart by blank lines
            outText.Write(Environment.NewLine);
            outText.Write(Environment.NewLine);
            if (node.HasChildNodes)
            {
                ConvertContentTo(node, outText);
            }
            outText.Write(Environment.NewLine);
            return;

        case "p":
        case "ul":
        case "ol":
        case "div":
        case "br":
            // block starts are rendered as a line break
            outText.Write(Environment.NewLine);
            break;

        case "li":
            outText.Write(Environment.NewLine + "* ");
            break;

        case "img":
            outText.Write(Environment.NewLine + imageLinkTextHighlight + "[img:" + node.Attributes["alt"]?.Value + "]" + resetColor);
            if (node.Attributes.Contains("src"))
            {
                if (TryResolveHttpUri(node.Attributes["src"].Value, out Uri imageUri))
                {
                    Images.Add(imageUri);
                    outText.Write(imageLinkHighlight + "[" + Images.Count + "] " + resetColor);
                }
            }
            break;

        case "strong":
            outText.Write(" " + linkTextHighlight);
            if (node.HasChildNodes)
            {
                ConvertContentTo(node, outText);
            }
            outText.Write(resetColor + " ");
            return;

        case "a":
            outText.Write(linkTextHighlight + " [Link:");
            if (node.HasChildNodes)
            {
                ConvertContentTo(node, outText);
            }
            outText.Write("]" + resetColor);
            if (node.Attributes.Contains("href"))
            {
                if (TryResolveHttpUri(node.Attributes["href"].Value, out Uri linkUri))
                {
                    Links.Add(linkUri);
                    outText.Write(linkHighlight + "[" + (Links.Count + LinkStartFrom) + "] " + resetColor);
                }
            }
            return;

        case "i":
            outText.Write(" ");
            if (node.HasChildNodes)
            {
                ConvertContentTo(node, outText);
            }
            outText.Write(" ");
            return;
    }

    if (node.HasChildNodes)
    {
        ConvertContentTo(node, outText);
    }
}