/// <summary>
/// Builds one <see cref="Resource"/> per table in the document. A table's first
/// <c>anchor</c> element is used when present; otherwise its first <c>a</c> element is
/// used, provided that element's <c>href</c> does not contain "html".
/// </summary>
/// <param name="htmlDoc">Parsed HTML document to scan.</param>
/// <returns>The resources found, in document order; empty when no table qualifies.</returns>
private static List <Resource> FindResources(AngleSharp.Dom.Html.IHtmlDocument htmlDoc)
{
    var resources = new List <Resource>();

    foreach (var table in htmlDoc.QuerySelectorAll("table"))
    {
        // Each selector is evaluated once per table instead of once per use.
        var anchor = table.QuerySelector("anchor");
        if (anchor != null)
        {
            resources.Add(new Resource()
            {
                Description = anchor.TextContent,
                Link = anchor.GetAttribute("pkqdhpef")
            });
            continue;
        }

        var link = table.QuerySelector("a");
        if (link == null)
        {
            continue;
        }

        // GetAttribute returns null when the attribute is missing. The previous code called
        // Contains on that null and threw; such links are now skipped.
        var href = link.GetAttribute("href");
        if (href == null || href.Contains("html"))
        {
            continue;
        }

        var title = link.GetAttribute("title");
        resources.Add(new Resource()
        {
            Description = !string.IsNullOrEmpty(title) ? title : link.TextContent,
            // NOTE(review): the link is taken from the element text, not from href. Kept as-is; confirm intent.
            Link = link.TextContent
        });
    }

    return(resources);
}
/// <summary>
/// Looks for the "commission file n..." landmark near the top of the filing and returns the
/// trimmed text of the first non-empty element found after it.
/// </summary>
/// <param name="doc">Parsed filing document.</param>
/// <returns>The trimmed element text, or an empty string when nothing is found within the search limit.</returns>
public static string ExtractRegisteredCompanyName(AngleSharp.Dom.Html.IHtmlDocument doc)
{
    var tableSelector = doc.QuerySelectorAll("*");
    const int ELEMENT_SEARCH_LIMIT = 200;
    string pattern = "^commission file n";

    int landmarkIndex = findByRegex(tableSelector, 0, pattern, RegexOptions.IgnoreCase);
    if (landmarkIndex > ELEMENT_SEARCH_LIMIT || landmarkIndex < 0)
    {
        Console.WriteLine("Cannot find registered company landmark in first " + ELEMENT_SEARCH_LIMIT + " elements of html doc");
        return("");
    }

    int postLandmarkIndex = skipPastRegex(tableSelector, landmarkIndex, pattern, RegexOptions.IgnoreCase);

    // Sometimes there's _____ or a file number, etc. in between the landmark and the company name.
    // NOTE(review): the original comment says this skips elements without at least two alphabetic
    // characters, but the pattern matches two consecutive NON-letters. Confirm against skipPastRegex.
    postLandmarkIndex = skipPastRegex(tableSelector, postLandmarkIndex, "[^A-Za-z]{2}", RegexOptions.IgnoreCase);
    if (postLandmarkIndex < 0)
    {
        return("");
    }

    // The previous loop tested postLandmarkIndex (which never changes) instead of i, so the limit was
    // never applied and i ran past the end of the collection. Bound the scan by both the search
    // limit and the actual number of elements.
    int searchEnd = Math.Min(ELEMENT_SEARCH_LIMIT, tableSelector.Length);
    for (int i = postLandmarkIndex; i < searchEnd; ++i)
    {
        string text = tableSelector[i].TextContent.Trim();
        if (text.Length > 0)
        {
            return(text);
        }
    }

    return("");
}
/// <summary>
/// Locates the table associated with a statement title and hands it to the index-based
/// <c>ExtractTableFromHTML</c> overload.
/// </summary>
/// <param name="doc">Parsed filing document.</param>
/// <param name="additionalLandmarks">Landmarks to pass before searching for the title; may be empty.</param>
/// <param name="statementTitle">Title landmark of the wanted statement.</param>
/// <param name="statementInstanceIndex">Not used by this method.</param>
/// <param name="outputPath">Forwarded to the extraction overload.</param>
/// <param name="rowHeadOverrides">Forwarded to the extraction overload.</param>
/// <param name="config">Forwarded to the extraction overload.</param>
public static void ExtractTableFromHTML(AngleSharp.Dom.Html.IHtmlDocument doc, IList <string> additionalLandmarks, string statementTitle, int statementInstanceIndex, string outputPath, IDictionary <string, string> rowHeadOverrides, IDictionary <string, string> config)
{
    // Flat list of every element in the DOM.
    var allElements = doc.QuerySelectorAll("*");

    // Where the search for the statement title starts.
    int searchFrom = 0;
    if (additionalLandmarks.Count > 0)
    {
        // Walk the landmark sequence first so earlier mentions of the title (e.g. in the TOC)
        // are left behind, then step past the final landmark.
        int sequenceEnd = findLandmarks(allElements, 0, additionalLandmarks);
        searchFrom = skipPastLandmark(allElements, sequenceEnd, additionalLandmarks[additionalLandmarks.Count - 1]);
    }

    int titleIndex = findLandmark(allElements, searchFrom, statementTitle);

    // Prefer the table that contains the title; otherwise take the first table after it.
    int tableIndex = findContainingElementByType(allElements, titleIndex, "table");
    if (tableIndex == -1)
    {
        tableIndex = findFollowingElementByType(allElements, titleIndex, "table");
        if (tableIndex == -1)
        {
            Console.WriteLine("No landmarked table found");
            return;
        }
    }

    ExtractTableFromHTML(allElements, tableIndex, outputPath, rowHeadOverrides, config);
}
/// <summary>
/// Reads every <c>li</c> element carrying a <c>logr</c> attribute and adds an
/// <c>ApartmentHouseInfo</c> to <c>DataContent.ApartmentHouseInfos</c> for each listing whose
/// URL is not already stored.
/// </summary>
/// <param name="confInfo">Dynamic configuration; <c>shortcutname.Value</c> and <c>cityname.Value</c> are read.</param>
/// <param name="page">Parsed listing page.</param>
private static void GetDataOnPageDoc(dynamic confInfo, AngleSharp.Dom.Html.IHtmlDocument page)
{
    foreach (var element in page.QuerySelectorAll("li").Where(li => li.HasAttribute("logr")))
    {
        var titleElement = element.QuerySelector("h2");
        var priceElement = element.QuerySelector("b");
        var linkElement = element.QuerySelector("a");

        // A list item missing any of these used to throw and abort the whole page. Skip it instead.
        if (titleElement == null || priceElement == null || linkElement == null)
        {
            continue;
        }

        var houseTitle = titleElement.TextContent;
        var houseInfoList = houseTitle.Split(' ');
        var displayPrice = priceElement.TextContent;

        // housePrice stays 0 when the price text is not an integer.
        int.TryParse(displayPrice, out var housePrice);

        var onlineUrl = $"http://{confInfo.shortcutname.Value}.58.com" + linkElement.GetAttribute("href");
        if (DataContent.ApartmentHouseInfos.Find(onlineUrl) != null)
        {
            continue;
        }

        // The location is the second title token unless both marker words are present. A title
        // with no space has only one token, so fall back to it and avoid indexing out of range.
        var hasBothMarkers = new[] { "公寓", "青年社区" }.All(s => houseInfoList.Contains(s));
        var houseLocation = (hasBothMarkers || houseInfoList.Length < 2) ? houseInfoList[0] : houseInfoList[1];

        var houseInfo = new ApartmentHouseInfo
        {
            HouseTitle = houseTitle,
            HouseOnlineURL = onlineUrl,
            DisPlayPrice = displayPrice,
            HouseLocation = houseLocation,
            DataCreateTime = DateTime.Now,
            Source = ConstConfigurationName.PinPaiGongYu,
            HousePrice = housePrice,
            HouseText = houseTitle,
            LocationCityName = confInfo.cityname.Value,
            PubTime = DateTime.Now
        };
        DataContent.ApartmentHouseInfos.Add(houseInfo);
    }
}
/// <summary>
/// Builds a <c>GeneralPost</c> from one tweet stream item and appends it to <c>posts</c>
/// while holding a lock on that collection.
/// </summary>
/// <param name="item">Stream-item element; must carry a <c>data-item-id</c> attribute.</param>
/// <param name="dict">Word list handed to <c>Cenzor.Cenz</c>.</param>
private void PostSearch(IElement item, List <string> dict)
{
    // Class tests are substring matches on the raw class attribute, as before. ClassName is null
    // for elements without a class attribute, so test for null first; the previous unguarded
    // Contains call threw on the first class-less element it met.
    Func <IElement, string, bool> hasClass =
        (element, fragment) => element.ClassName != null && element.ClassName.Contains(fragment);

    GeneralPost tweet = new GeneralPost();

    // Optional media image.
    var mediaContainer = item.QuerySelectorAll("div").FirstOrDefault(k => hasClass(k, "AdaptiveMediaOuterContainer"));
    if (mediaContainer != null)
    {
        var image = mediaContainer.QuerySelector("img");
        if (image != null)
        {
            tweet.Image = image.GetAttribute("src");
        }
    }

    long id = long.Parse(item.Attributes["data-item-id"].Value);

    tweet.Text = item.QuerySelectorAll("p").First(k => hasClass(k, "tweet-text")).InnerHtml;
    Cenzor cenzor = new Cenzor();
    tweet.Text = cenzor.Cenz(tweet.Text, dict);
    tweet.Social = SocialMedia.Twitter;

    // The same div carries both the display name and the screen name; look it up once.
    var tweetDiv = item.QuerySelectorAll("div").First(k => hasClass(k, "tweet"));
    tweet.AuthorName = tweetDiv.Attributes["data-name"].Value;
    string linkname = tweetDiv.Attributes["data-screen-name"].Value;
    tweet.PostLink = "https://twitter.com/" + linkname + "/status/" + id;
    tweet.AuthorLink = "https://twitter.com/" + linkname;
    tweet.AuthorAvatar = item.QuerySelectorAll("img").First(y => hasClass(y, "avatar")).Attributes["src"].Value;

    try
    {
        var timestampLink = item.QuerySelectorAll("div").First(k => hasClass(k, "content"))
            .QuerySelectorAll("div").First(o => hasClass(o, "stream-item-header"))
            .QuerySelectorAll("small").First(u => hasClass(u, "time"))
            .QuerySelectorAll("a").First(f => hasClass(f, "tweet-timestamp"));

        // The title is split on '-' into a time part and a date part. The date part is split on
        // spaces and tokens 1..3 are read as day, month name and year.
        // (presumably shaped like "H:mm - d MMM yyyy" — TODO confirm)
        var titleParts = timestampLink.Attributes["title"].Value.Split('-');
        var dateTokens = titleParts[1].Split(' ');
        var timeTokens = titleParts[0].TrimEnd(' ').Split(':');

        // The previous code also read the "_timestamp" span's data-time-ms into a local that was
        // never used. That lookup is dropped, so a missing span no longer discards the date.
        tweet.Date = new DateTime(
            Int32.Parse(dateTokens[3]),
            getMonth(dateTokens[2]),
            Int32.Parse(dateTokens[1]),
            Int32.Parse(timeTokens[0]),
            Int32.Parse(timeTokens[1]),
            0);
    }
    catch (Exception)
    {
        // Best effort: when the timestamp is missing or malformed, Date keeps its default value.
    }

    // Replace every link in the tweet text with the link's inner HTML.
    AngleSharp.Parser.Html.HtmlParser parser = new AngleSharp.Parser.Html.HtmlParser();
    AngleSharp.Dom.Html.IHtmlDocument htmldocument = parser.Parse(tweet.Text);
    foreach (var link in htmldocument.QuerySelectorAll("a"))
    {
        tweet.Text = tweet.Text.Replace(link.OuterHtml, link.InnerHtml);
    }

    lock (posts)
    {
        posts.Add(tweet);
    }
}
/// <summary>
/// Adds the <c>img-responsive</c> class to every <c>img</c> element in the document.
/// </summary>
/// <param name="document">Document whose images are modified in place.</param>
private static void AddImageClassResponsive(AngleSharp.Dom.Html.IHtmlDocument document)
{
    var images = document.QuerySelectorAll("img");
    for (int index = 0; index < images.Length; index++)
    {
        images[index].ClassList.Add("img-responsive");
    }
}
/// <summary>
/// Downloads and parses recipe page number <paramref name="PageNumber"/> and stores the parts of
/// interest on <c>blydo</c>.
/// </summary>
/// <param name="PageNumber">Number inserted into the recipe page URL.</param>
/// <remarks>
/// This method is <c>async void</c>, so callers cannot await it or observe its exceptions. The
/// signature is kept for compatibility. Element lookups are null-checked so a page that lacks
/// the title or picture leaves the matching field null instead of throwing.
/// </remarks>
public static async void PageParsing(int PageNumber)
{
    HtmlParser parser = new HtmlParser();
    AngleSharp.Dom.Html.IHtmlDocument document = await parser.ParseAsync(DownloadPage("https://e-dostavka.by/recipe/hot/" + PageNumber + ".html"));

    // Dish name.
    var titleElement = document.QuerySelector("h1");
    blydo.BlydoName = titleElement != null ? titleElement.TextContent : null;

    // Dish picture.
    var pictureElement = document.QuerySelector("img.retina_redy");
    blydo.BlydoPicture = pictureElement != null ? pictureElement.GetAttribute("src") : null;

    blydo.IngName = document.QuerySelectorAll("li.not_in_cart a");
    blydo.Count = document.QuerySelectorAll("li.not_in_cart span");

    // Both fields are filled from the same selector, exactly as before.
    blydo.BlydoSP_Picture = document.QuerySelectorAll("a.fancy_img");
    blydo.BlydoSposobPrigotovleniya = document.QuerySelectorAll("a.fancy_img");
}
/// <summary>
/// Replaces an inline content item element with the nodes parsed from
/// <paramref name="fragmentText"/>. When the strict parser rejects the fragment, the element is
/// replaced by a text node describing the error instead.
/// </summary>
/// <param name="document">Document used to create the error text node.</param>
/// <param name="inlineContentItemElement">Element to replace.</param>
/// <param name="contentItemCodename">Codename reported in the error text.</param>
/// <param name="inlineContentItem">Item whose type is reported in the error text.</param>
/// <param name="fragmentText">HTML fragment to parse in the context of the element's parent.</param>
private void ReplaceElementWithFragmentNodes(AngleSharp.Dom.Html.IHtmlDocument document, IElement inlineContentItemElement, string contentItemCodename, object inlineContentItem, string fragmentText)
{
    try
    {
        var contextElement = inlineContentItemElement.ParentElement;
        var replacementNodes = _strictHtmlParser.ParseFragment(fragmentText, contextElement).ToArray();
        inlineContentItemElement.Replace(replacementNodes);
    }
    catch (HtmlParseException parseError)
    {
        var message = $"[Inline content item resolver provided an invalid HTML 5 fragment ({parseError.Position.Line}:{parseError.Position.Column}). Please check the output for a content item {contentItemCodename} of type {GetInlineContentItemType(inlineContentItem)}.]";
        inlineContentItemElement.Replace(document.CreateTextNode(message));
    }
}
/// <summary>
/// Scans the first elements of the filing for text matching "FORM 10-K" or "FORM 10-Q"
/// (case-insensitive, optional spaces around the hyphen) and returns the form type.
/// </summary>
/// <param name="doc">Parsed filing document.</param>
/// <returns>The captured form type after <c>EliminateWhitespace</c>, or an empty string when not found.</returns>
public static string ExtractFormType(AngleSharp.Dom.Html.IHtmlDocument doc)
{
    var tableSelector = doc.QuerySelectorAll("*");
    const int ELEMENT_SEARCH_LIMIT = 200;

    // Documents with fewer elements than the limit used to be indexed past their end.
    int searchEnd = Math.Min(ELEMENT_SEARCH_LIMIT, tableSelector.Length);
    for (int iElement = 0; iElement < searchEnd; ++iElement)
    {
        var element = tableSelector[iElement];
        Match m = Regex.Match(element.TextContent.Trim().ToUpper(), @"FORM\s+(10\s?-\s?[KQ])");
        if (m.Success)
        {
            // Group 1 is the form type without the leading "FORM".
            return(EliminateWhitespace(m.Groups[1].Value));
        }
    }

    return(""); // Not found
}
/// <summary>
/// Collects a <see cref="Resource"/> for every <c>li</c> inside a <c>div.p_list</c> that has
/// an <c>a</c> element, a <c>span</c> element and an <c>a.d1</c> link.
/// </summary>
/// <param name="htmlDoc">Parsed HTML document to scan.</param>
/// <returns>The resources found, in document order; empty when none qualify.</returns>
private static List <Resource> FindResources(AngleSharp.Dom.Html.IHtmlDocument htmlDoc)
{
    var resources = new List <Resource>();

    foreach (var li in htmlDoc.QuerySelectorAll("div.p_list").SelectMany(l => l.QuerySelectorAll("li")))
    {
        var titleAnchor = li.QuerySelector("a");
        if (titleAnchor == null || li.QuerySelector("span") == null)
        {
            continue;
        }

        // The previous code checked only for "a" but then read href from "a.d1", which threw
        // when the item had no anchor with that class. Such items are now skipped.
        var linkAnchor = li.QuerySelector("a.d1");
        if (linkAnchor == null)
        {
            continue;
        }

        resources.Add(new Resource()
        {
            Description = titleAnchor.GetAttribute("title"),
            Link = linkAnchor.GetAttribute("href"),
        });
    }

    return(resources);
}
/// <summary>
/// Reads the <c>csrf-param</c> and <c>csrf-token</c> meta tags and returns them as a form
/// value pair, together with the fixed <c>utf8</c> entry.
/// </summary>
/// <param name="doc">Parsed page expected to contain the CSRF meta tags.</param>
/// <returns>
/// A dictionary that always contains <c>utf8</c>. The CSRF pair is added only when the
/// <c>csrf-param</c> meta tag supplies a name.
/// </returns>
protected Dictionary <string, string> GetCSRFParams(AngleSharp.Dom.Html.IHtmlDocument doc)
{
    // QuerySelector returns null when the tag is absent. The previous
    // FirstOrDefault().GetAttribute(...) chain threw a NullReferenceException in that case.
    var paramMeta = doc.QuerySelector("meta[name='csrf-param']");
    var csrfParam = paramMeta != null ? paramMeta.GetAttribute("content") : null;
    Log.Verbose("Found CSRF Param: " + csrfParam);

    var tokenMeta = doc.QuerySelector("meta[name='csrf-token']");
    var csrfToken = tokenMeta != null ? tokenMeta.GetAttribute("content") : null;
    Log.Verbose("Found CSRF Token: " + csrfToken);

    var values = new Dictionary <string, string> { { "utf8", "✓" } };

    // A dictionary key cannot be null, so the pair is omitted when no parameter name was found.
    if (csrfParam != null)
    {
        values.Add(csrfParam, csrfToken);
    }

    return(values);
}
/// <summary>
/// Parses the given HTML, starts one thread per tweet stream item to run
/// <c>PostSearch</c> on it, waits for all threads to finish and returns <c>posts</c>.
/// </summary>
/// <param name="html">Markup containing the tweet stream.</param>
/// <param name="dict">Word list forwarded to <c>PostSearch</c>.</param>
/// <returns>The <c>posts</c> collection; this method does not clear it before use.</returns>
public List <GeneralPost> GetTweets(string html, List <string> dict)
{
    var document = new AngleSharp.Parser.Html.HtmlParser().Parse(html);

    var workers = new List <Thread>();
    foreach (var candidate in document.QuerySelectorAll("li"))
    {
        // Keep only stream items that are not adaptive-stream containers.
        var classes = candidate.ClassName;
        if (classes == null || !classes.Contains("stream-item") || classes.Contains("AdaptiveStream"))
        {
            continue;
        }

        var streamItem = candidate;
        var worker = new Thread(() => PostSearch(streamItem, dict));
        workers.Add(worker);
        worker.Start();
    }

    foreach (var running in workers)
    {
        running.Join();
    }

    return(posts);
}
/// <summary>
/// Reads and parses a filing, works out its company name and form type, then extracts tables
/// using the company's configuration file when one exists and a generic approach otherwise.
/// </summary>
/// <param name="sourceFilePath">Path of the filing HTML file.</param>
static void ExtractFromFiling(string sourceFilePath)
{
    Console.WriteLine("Extracting from " + sourceFilePath);

    // Read and parse the filing; report and stop on any failure.
    AngleSharp.Dom.Html.IHtmlDocument filing;
    try
    {
        filing = ReadAndParseHtmlFile(sourceFilePath);
    }
    catch (Exception readError)
    {
        Console.WriteLine("Exception reading " + sourceFilePath + ":");
        Console.WriteLine(readError);
        return;
    }

    if (filing == null)
    {
        Console.WriteLine("No data parsing " + sourceFilePath + ":");
        return;
    }

    string companyName = ExtractRegisteredCompanyName(filing);
    string formType = ExtractFormType(filing);
    Console.WriteLine("Extracting Tables from Form " + formType + " for " + companyName);

    // The configuration file is named after the registered company.
    string configurationPath = @"config\" + companyName + ".json";
    bool useConfiguration = companyName.Length > 0 && formType.Length > 0 && File.Exists(configurationPath);

    if (!useConfiguration)
    {
        // No usable configuration: fall back to the generalized approach.
        ExtractFromFilingWithoutConfiguration(filing, companyName, formType);
        return;
    }

    ExtractFromFilingUsingConfiguration(filing, configurationPath);
}
/// <summary>
/// Wraps <c>li</c> elements whose parent is neither <c>ul</c> nor <c>ol</c> in a new
/// <c>ul</c>. All <c>li</c> children of the same parent are moved into one list, which is
/// appended to the end of that parent.
/// </summary>
/// <param name="doc">Document to repair in place.</param>
private static void LiTagsCheck(AngleSharp.Dom.Html.IHtmlDocument doc)
{
    while (true)
    {
        var strayLi = doc.All.FirstOrDefault(x => x.LocalName == "li" && (x.ParentElement == null || (x.ParentElement.LocalName != "ul" && x.ParentElement.LocalName != "ol")));
        if (strayLi == null)
        {
            break;
        }

        var parent = strayLi.ParentElement;
        if (parent == null)
        {
            // There is nothing to attach a list to. Stop here so the loop cannot spin forever
            // on the same element (the previous code dereferenced the null parent).
            break;
        }

        // Take a snapshot first: the loop below removes children from the same parent, and the
        // previous code did that while still enumerating parent.Children.
        var listItems = parent.Children.Where(x => x.LocalName == "li").ToList();

        var ul = doc.CreateElement("ul");
        foreach (var item in listItems)
        {
            parent.RemoveChild(item);
            ul.AppendChild(item);
        }

        // NOTE(review): the list goes to the end of the parent, not where the first item was.
        // Kept to preserve the existing output; confirm whether position matters.
        parent.AppendChild(ul);
    }
}
/// <summary>
/// Returns every <c>object</c> element under the document body whose <c>type</c> is
/// <c>application/kenticocloud</c> and whose <c>data-type</c> is <c>item</c>.
/// </summary>
/// <param name="htmlInput">Parsed document to search.</param>
/// <returns>The matching elements in document order.</returns>
private static List <IElement> GetInlineContentItemElements(AngleSharp.Dom.Html.IHtmlDocument htmlInput)
{
    var inlineItems = new List <IElement>();

    foreach (var candidate in htmlInput.Body.GetElementsByTagName("object"))
    {
        if (candidate.GetAttribute("type") != "application/kenticocloud")
        {
            continue;
        }

        if (candidate.GetAttribute("data-type") != "item")
        {
            continue;
        }

        inlineItems.Add(candidate);
    }

    return inlineItems;
}
/// <summary>
/// Handles a completed page crawl: records the URL as crawled and, for successful responses,
/// optionally extracts meta data and the page's outgoing links.
/// </summary>
/// <param name="sender">Event source; not used.</param>
/// <param name="e">Carries the crawled page.</param>
private void crawler_CrawlerComplete(object sender, PageCrawlCompletedArgs e)
{
    CrawledPage page = e.CrawledPage;
    string currentURL = page.Uri.ToString();

    // A failed request may carry no response object, so check for null before reading the status.
    bool failed = page.WebException != null
        || page.HttpWebResponse == null
        || page.HttpWebResponse.StatusCode != System.Net.HttpStatusCode.OK;

    if (failed)
    {
        Console.WriteLine("## Error on {0}", currentURL);
        Console.WriteLine();
        AddToCrawledPages(currentURL);
        return;
    }

    Console.WriteLine("Crawl OK: {0}", currentURL);
    AddToCrawledPages(currentURL);

    if (IncludeMetaData)
    {
        Console.Write("Extracting meta: ");

        // Only real pages carry meta data worth extracting.
        if (IsPageUrl(currentURL))
        {
            MetaData.Add(ReadPageMetaData(page.AngleSharpHtmlDocument, currentURL));
            Console.WriteLine(" OK");
        }
        else
        {
            Console.WriteLine(" NO VALID PAGE");
        }
    }

    if (ExtractLinks)
    {
        // store in dictionary
        this.PagesCrawledLinks.Add(currentURL, CollectPageLinks(page));
    }

    // new line
    Console.WriteLine();
}

/// <summary>Reads the title plus the description and keywords meta tags of a page.</summary>
private static Models.MetaData ReadPageMetaData(AngleSharp.Dom.Html.IHtmlDocument htmlPage, string url)
{
    Models.MetaData metaDataDTO = new Models.MetaData();
    metaDataDTO.Url = url;

    AngleSharp.Dom.IElement titleElement = htmlPage.QuerySelector("title");
    metaDataDTO.Title = (titleElement != null) ? titleElement.TextContent : "";
    Console.Write("title ");

    foreach (var item in htmlPage.QuerySelectorAll("meta"))
    {
        // GetAttribute returns null for a missing attribute, so a meta tag without "name" never
        // matches and one without "content" yields null instead of throwing.
        string name = item.GetAttribute("name");
        if (name == "description")
        {
            metaDataDTO.Description = item.GetAttribute("content");
            Console.Write("description ");
        }
        else if (name == "keywords")
        {
            metaDataDTO.Keywords = item.GetAttribute("content");
            Console.Write("keywords ");
        }
    }

    return metaDataDTO;
}

/// <summary>Collects the usable link targets of a page as absolute URLs.</summary>
private List <string> CollectPageLinks(CrawledPage page)
{
    // Restrict the search to the configured container when one is set.
    string selector = "a";
    if (!String.IsNullOrWhiteSpace(ClassSelector))
    {
        selector = ClassSelector + " " + selector;
    }

    List <string> linksFound = new List <string>();
    foreach (var anchor in page.AngleSharpHtmlDocument.QuerySelectorAll(selector))
    {
        string linkValue = anchor.GetAttribute("href");
        if (linkValue == null)
        {
            continue;
        }

        // dismiss non valid values
        if (linkValue.Contains("javascript:") || linkValue.Contains("mailto:") || linkValue.Contains("tel:"))
        {
            continue;
        }

        if (!linkValue.StartsWith("http", StringComparison.Ordinal))
        {
            // Resolve against the page URL. The previous "authority + / + link" join produced
            // double slashes for root-relative links and ignored the page's own directory.
            Uri resolved;
            if (Uri.TryCreate(page.Uri, linkValue, out resolved))
            {
                linkValue = resolved.AbsoluteUri;
            }
            else
            {
                linkValue = page.Uri.GetLeftPart(UriPartial.Authority) + "/" + linkValue;
            }
        }

        if (!string.IsNullOrWhiteSpace(linkValue))
        {
            linksFound.Add(linkValue);
        }
    }

    return linksFound;
}
/// <summary>
/// Extracts statement tables from a filing without a company configuration: scans the table
/// of contents for links mentioning "statement" or "balance" and extracts the table each link
/// points to.
/// </summary>
/// <param name="htmlDoc">Parsed filing document.</param>
/// <param name="registeredCompanyName">Used in the output file names.</param>
/// <param name="formName">Used in the output file names.</param>
static void ExtractFromFilingWithoutConfiguration(AngleSharp.Dom.Html.IHtmlDocument htmlDoc, string registeredCompanyName, string formName)
{
    // Select all the DOM's elements
    var tableSelector = htmlDoc.QuerySelectorAll("*");

    // See if there's a TOC
    int iElement = findByRegex(tableSelector, 0, @"^united states", RegexOptions.IgnoreCase, 1000);
    iElement = findByRegex(tableSelector, iElement, @"^table of contents|^index", RegexOptions.IgnoreCase, 1000);

    string notesSectionTitleRegexPattern = @"^(?:Condensed )?notes to .*consolidated (?:condensed )?financial statements|^Supplemental Financial Data";
    string notesSectionTitle = "";
    IDictionary <string, string> statementHrefs = new Dictionary <string, string>();

    if (iElement <= 0)
    {
        Console.WriteLine("No TOC found. Skipping (for now).");
        return;
    }

    Console.WriteLine("Processing TOC");

    // Get key landmarks from the TOC. TOC is bounded by the HR that follows it. The search is
    // limited to 1000 elements and, unlike before, never runs past the end of the document.
    int lastElementToConsider = Math.Min(iElement + 1000, tableSelector.Length);
    for (; iElement < lastElementToConsider && !string.Equals(tableSelector[iElement].TagName, "hr", StringComparison.OrdinalIgnoreCase); ++iElement)
    {
        var element = tableSelector[iElement];
        bool isAnchor = string.Equals(element.TagName, "a", StringComparison.OrdinalIgnoreCase);

        // One combined test: a link mentioning both words used to be added twice, and the second
        // Add threw on the duplicate key.
        if (isAnchor && Regex.IsMatch(element.TextContent, "statement|balance", RegexOptions.IgnoreCase))
        {
            string href = element.GetAttribute("href");
            if (href != null)
            {
                string target = href.Replace("#", "");
                if (!statementHrefs.ContainsKey(target))
                {
                    statementHrefs.Add(target, element.TextContent.Trim());
                }
            }

            iElement = skipPastLandmark(tableSelector, iElement, element.TextContent);
        }

        if (Regex.IsMatch(element.TextContent, notesSectionTitleRegexPattern, RegexOptions.IgnoreCase))
        {
            // The notes section title is recorded here but not read again within this method.
            notesSectionTitle = StandardizeWhitespace(element.TextContent.Trim());
            iElement = skipPastRegex(tableSelector, iElement, notesSectionTitleRegexPattern, RegexOptions.IgnoreCase);
        }
    }

    foreach (var entry in statementHrefs)
    {
        Console.WriteLine(entry.Value + " @ " + entry.Key);
    }

    foreach (var entry in statementHrefs)
    {
        string outputFileDirectory = @"c:\temp\10QParseOutput\"; // MAKE ME CONFIGURABLE
        string outputFileName = outputFileDirectory + registeredCompanyName + "_" + formName + "_" + entry.Value + "_" + entry.Key + ".txt";
        IDictionary <string, string> rowHeadOverrideDict = new Dictionary <string, string>();
        IDictionary <string, string> parametersDict = new Dictionary <string, string>();
        ExtractTableFromHTML(tableSelector, entry.Key, outputFileName, rowHeadOverrideDict, parametersDict);
    }

    Console.WriteLine();

    // NOTE(review): this blocks until a line is read from the console. Kept as-is; confirm it is
    // still wanted outside interactive runs.
    Console.ReadLine();
}
/// <summary>
/// Normalizes a submitted document in place: wraps stray list items, marks inputs with the
/// <c>ignore</c> class, turns scripts into encoded <c>pre</c>/<c>code</c> blocks, makes sure
/// code blocks sit inside <c>pre</c>, removes empty elements and stores inline base64 images
/// as files.
/// </summary>
/// <param name="doc">Document to normalize.</param>
static void InputCheck(AngleSharp.Dom.Html.IHtmlDocument doc)
{
    LiTagsCheck(doc);

    foreach (var item in doc.QuerySelectorAll("input"))
    {
        if (!item.ClassList.Contains("ignore"))
        {
            item.ClassList.Add("ignore");
        }
    }

    // Each query below is snapshotted with ToList() because the loop that follows it changes
    // the child list being queried. The previous code mutated those lists while still
    // enumerating them.
    var scripts = doc.Body.Children.Where(x => x.LocalName == "script" && x.ParentElement.LocalName != "code").ToList();
    foreach (var item in scripts)
    {
        var parent = item.ParentElement;
        parent.RemoveChild(item);
        var code = doc.CreateElement("code");
        code.AppendChild(item);
        parent.AppendChild(code);
    }

    // Replace every script with an HTML-encoded listing appended to the body.
    var scritps = doc.Scripts.ToList();
    foreach (var item in scritps)
    {
        var pre = doc.CreateElement("pre");
        pre.ClassName = "code code-javascript";
        var code = doc.CreateElement("code");
        code.InnerHtml = HttpUtility.HtmlEncode(item.OuterHtml);
        pre.AppendChild(code);
        item.Remove();
        doc.Body.AppendChild(pre);
    }

    // Wrap top-level code elements in a pre.
    var codes = doc.Body.Children.Where(x => x.LocalName == "code" && x.ParentElement.LocalName != "pre").ToList();
    foreach (var item in codes)
    {
        var parent = item.ParentElement;
        parent.RemoveChild(item);
        var pre = doc.CreateElement("pre");
        pre.ClassName = "code code-javascript";
        pre.AppendChild(item);
        parent.AppendChild(pre);
    }

    var pres = doc.Body.Children.Where(x => x.LocalName == "pre").ToList();
    foreach (var item in pres)
    {
        if (!item.Children.Any(x => x.LocalName == "code"))
        {
            // Move the element children of the pre into a new code element.
            var code = doc.CreateElement("code");
            foreach (var c in item.Children.ToList())
            {
                c.Remove();
                code.AppendChild(c);
            }

            item.AppendChild(code);
        }

        if (!item.ClassList.Contains("code"))
        {
            item.ClassList.Add("code");
        }

        if (item.TextContent.Contains("script"))
        {
            item.ClassList.Add("code-javascript");
        }
        else if (item.TextContent.Contains("style"))
        {
            item.ClassList.Add("code-css");
        }
    }

    var empty = doc.Body.QuerySelectorAll(":empty");
    foreach (var item in empty)
    {
        // NOTE(review): this is a substring test, so any tag name contained in "img video"
        // (for example "i") is kept as well. Left unchanged; confirm whether exact matching is wanted.
        if (!"img video".Contains(item.LocalName))
        {
            item.Remove();
            continue;
        }

        if (item.LocalName != "img")
        {
            continue;
        }

        // An img without a src attribute used to throw here.
        var src = item.GetAttribute("src");
        if (src == null || !src.Contains(";base64,"))
        {
            continue;
        }

        // Path.Combine discards the earlier parts when a later part is rooted, so the previous
        // "/data/img/upload" argument threw away the application path. Passing relative
        // segments keeps the upload folder under the application root.
        string filePath = System.IO.Path.Combine(HostingEnvironment.ApplicationPhysicalPath, "data", "img", "upload");
        var str = src.WriteImageString(filePath);
        if (str.Success)
        {
            item.SetAttribute("src", str.FileName);
        }
    }
}
/// <summary>
/// Parses an HTML string into UI elements and groups them under their root frames.
/// </summary>
/// <param name="html">Markup to parse (the parser is configured with CSS support).</param>
/// <returns>
/// The root frames that ended up with at least one child, ordered by their <c>y</c> value, or
/// <c>null</c> when the parser throws.
/// </returns>
public static UIBase[] ParseAttributesForUI(string html)
{
    var options = Configuration.Default.WithCss();
    var parser = new HtmlParser(options);
    AngleSharp.Dom.Html.IHtmlDocument doc = null;
    try
    {
        doc = parser.Parse(html);
    }
    catch (Exception)
    {
        // Any parser failure is reported to the caller as null.
        return(null);
    }

    List <UIBase> attributes = new List <UIBase>();
    List <UIBase> attributesNotIn = new List <UIBase>();
    int[] layout = { 0, 0, 0, 0 };

    // Collect all elements as one flat list in "attributes" (the element-level overload fills
    // both lists).
    foreach (var item in doc.Body.Children)
    {
        ParseAttributesForUI(attributes, attributesNotIn, item, layout);
    }

    // Second attempt for the elements in attributesNotIn: attach each one to every frame that
    // reports containing it. There is no break, so one element can be added to several frames.
    foreach (var spec in attributesNotIn)
    {
        foreach (var item in attributes)
        {
            var frame = item as UIFrame;
            if (frame != null)
            {
                if (spec.IsIn(frame))
                {
                    frame.AddChild(spec);
                }
            }
        }
    }

    // Find the root frames: frames that are not inside any other frame.
    List <UIFrame> rootframes = new List <UIFrame>();
    foreach (var candidate in attributes)
    {
        UIFrame frame_candidate = candidate as UIFrame;
        if (frame_candidate == null)
        {
            continue;
        }
        foreach (var attribute in attributes)
        {
            UIFrame frame = attribute as UIFrame;
            if (frame == null)
            {
                continue; // Not a frame, so it cannot contain the candidate.
            }
            if (attribute == frame_candidate)
            {
                continue; // Same object, nothing to test.
            }
            if (frame_candidate.IsIn(frame)) // The candidate lies inside another frame,
            {
                // so it cannot be a root frame.
                frame_candidate = null;
                break;
            }
        }
        if (frame_candidate != null)
        {
            // Is a root frame
            rootframes.Add(frame_candidate);
        }
    }

    // All root frames are known now; remove them from the flat list.
    foreach (var frame in rootframes)
    {
        attributes.Remove(frame);
    }

    // Distribute the remaining elements over the root frames. Each pass takes the first
    // remaining element and removes it from "attributes", so the loop ends when the list is empty.
    UIBase elm = attributes.FirstOrDefault();
    while (elm != null)
    {
        foreach (var attribute in rootframes)
        {
            UIFrame frame = attribute as UIFrame;
            // Defensive null check; rootframes is typed as a list of UIFrame.
            if (frame == null)
            {
                continue;
            }
            if (elm.IsIn(frame))
            {
                frame.AddChild(elm);
                attributes.Remove(elm);
                elm = null;
                break;
            }
        }
        if (elm != null) // No root frame contains the element, so it becomes a root itself.
        {
            // NOTE(review): this cast throws InvalidCastException when elm is not a UIFrame,
            // which is the case the original comment ("a none frame that is a root") describes.
            rootframes.Add((UIFrame)elm);
            attributes.Remove(elm);
        }
        elm = attributes.FirstOrDefault();
    }

    // noframe is assigned below but never read in this method.
    UIFrame noframe = null;
    List <UIFrame> toremove = new List <UIFrame>();
    foreach (var item in rootframes)
    {
        UIFrame frame = item as UIFrame;
        // Root frames without children are collected and removed after the loop, because the
        // list cannot be changed while it is being enumerated.
        if (frame.Children == null || frame.Children.Count() == 0)
        {
            toremove.Add(item);
        }
        if (frame.NestedName == "NO_FRAME")
        {
            noframe = frame;
        }
    }
    foreach (var item in toremove)
    {
        rootframes.Remove(item);
    }

    // Return the remaining root frames ordered by their y value.
    return(rootframes.OrderBy(x => x.y).ToArray());
}
/// <summary>
/// Wraps an AngleSharp document and exposes its root element as <c>DocumentNode</c>.
/// </summary>
/// <param name="Document">The parsed document to wrap.</param>
internal AngleSharpDocument(AngleSharp.Dom.Html.IHtmlDocument Document)
{
    this.Document = Document;

    // The wrapper node is built around the document's root element.
    var rootElement = Document.DocumentElement;
    DocumentNode = new AngleSharpNode(rootElement);
}
/// <summary>
/// Extracts every table listed under <c>Filings / 10-Q / Tables</c> in the company
/// configuration file from the given filing.
/// </summary>
/// <param name="htmlDoc">Parsed filing document.</param>
/// <param name="companyConfigurationFileName">Path of the company's JSON configuration file.</param>
static void ExtractFromFilingUsingConfiguration(AngleSharp.Dom.Html.IHtmlDocument htmlDoc, string companyConfigurationFileName)
{
    string companyConfigJson;
    try
    {
        companyConfigJson = File.ReadAllText(companyConfigurationFileName);
    }
    catch (Exception e)
    {
        Console.WriteLine("Exception on read of company configuration file " + companyConfigurationFileName);
        Console.WriteLine(e);
        return;
    }

    // Parse the company configuration JSON file and extract the salient details. Malformed JSON
    // is reported the same way as a read failure instead of escaping as an exception.
    JObject config;
    try
    {
        config = JObject.Parse(companyConfigJson);
    }
    catch (Exception e)
    {
        Console.WriteLine("Exception on parse of company configuration file " + companyConfigurationFileName);
        Console.WriteLine(e);
        return;
    }

    // registeredName and cik are read but not used further in this method.
    var registeredName = getStringValueFromJObject(config, "RegisteredName");
    var ticker = getStringValueFromJObject(config, "Ticker");
    var cik = getStringValueFromJObject(config, "CIK");

    // A missing section used to surface as a NullReferenceException further down.
    JObject jFilings = config["Filings"] as JObject;
    JObject tenQfiling = (jFilings != null) ? jFilings["10-Q"] as JObject : null;
    JArray filingTables = (tenQfiling != null) ? tenQfiling["Tables"] as JArray : null;
    if (filingTables == null)
    {
        Console.WriteLine("No 10-Q table definitions found in company configuration file " + companyConfigurationFileName);
        return;
    }

    foreach (var filingTable in filingTables.Values <JObject>())
    {
        string statementTitle = getStringValueFromJObject(filingTable, "StatementTitle");

        // The JObject indexer returns the property's value, not the JProperty, so the previous
        // (JProperty) cast threw whenever the key was present. The key spelling is part of the
        // configuration format and stays as it is.
        JToken occurrenceToken = filingTable["TitleOccurence"];
        int titleOccurrence = (occurrenceToken != null && occurrenceToken.Type != JTokenType.Null)
            ? occurrenceToken.Value <int>()
            : 1;

        JArray jLandmarks = (JArray)filingTable["AdditionalLandmarks"];
        List <string> additionalLandmarks = new List <string>();
        if (jLandmarks != null)
        {
            foreach (var jLandmark in jLandmarks)
            {
                additionalLandmarks.Add(((JValue)jLandmark).ToString());
            }
        }

        IDictionary <string, string> rowHeadOverrideDict = new Dictionary <string, string>();
        IDictionary <string, string> parametersDict = new Dictionary <string, string>();
        JObject jOptions = (JObject)filingTable["Options"];
        if (jOptions != null)
        {
            // Either list may be absent from the options object.
            addFirstPropertyOfEachObject(jOptions["RowHeadOverrides"] as JArray, rowHeadOverrideDict);
            addFirstPropertyOfEachObject(jOptions["Parameters"] as JArray, parametersDict);
        }

        // Extract the current table
        string outputFileDirectory = @"c:\temp\10QParseOutput\"; // MAKE ME CONFIGURABLE
        string outputFileName = outputFileDirectory + ticker + "_" + statementTitle + "_" + titleOccurrence + ".txt";
        Console.WriteLine("Extracting table " + statementTitle + " to " + outputFileName);
        ExtractTableFromHTML(htmlDoc, additionalLandmarks, statementTitle, titleOccurrence, outputFileName, rowHeadOverrideDict, parametersDict);
    }
}

// Adds the first property (name and value text) of every object in the array to the target
// dictionary. A null array and empty objects are skipped.
private static void addFirstPropertyOfEachObject(JArray source, IDictionary <string, string> target)
{
    if (source == null)
    {
        return;
    }

    foreach (var entry in source.Children <JObject>())
    {
        JProperty first = entry.Properties().FirstOrDefault();
        if (first == null)
        {
            continue;
        }

        target.Add(first.Name, first.Value.ToString());
    }
}
/// <summary>
/// Parses the configured target text (after variable replacement) according to <c>Type</c>
/// and returns the extracted values.
/// </summary>
/// <param name="data">Bot data used by <c>ReplaceValues</c> to fill in the configured strings.</param>
/// <returns>
/// The parsed values; empty when nothing matched. With <c>recursive</c> set, every match is
/// collected; otherwise only one value is taken.
/// </returns>
private List <string> Parse(BotData data)
{
    var original = ReplaceValues(parseTarget, data);
    var partial = original;
    var list = new List <string>();

    // Parse the value
    switch (Type)
    {
    // Left/right delimiter parsing.
    case ParseType.LR:
        var ls = ReplaceValues(leftString, data);
        var rs = ReplaceValues(rightString, data);
        var pFrom = 0;
        var pTo = 0;

        // No L and R = return full input
        if (ls == "" && rs == "")
        {
            list.Add(original);
            break;
        }
        // L or R not present and not empty
        else if (((!partial.Contains(ls) && ls != "") || (!partial.Contains(rs) && rs != "")))
        {
            break;
        }

        // Instead of the code below, Extreme.NET's Substring extensions could be used:
        //   original.Substrings(ls, rs) for the recursive case,
        //   original.Substring(ls, rs) for the non-recursive case.
        if (recursive)
        {
            if (useRegexLR)
            {
                // Regex errors are swallowed; the list then keeps whatever was added so far.
                try
                {
                    var pattern = BuildLRPattern(ls, rs);
                    MatchCollection mc = Regex.Matches(partial, pattern);
                    foreach (Match m in mc)
                    {
                        list.Add(m.Value);
                    }
                }
                catch { }
            }
            else
            {
                // The loop consumes "partial" from the left. A Substring failure (for example the
                // right delimiter only occurring before the left one) ends it through the catch.
                try
                {
                    while ((partial.Contains(ls) || ls == "") && (partial.Contains(rs) || rs == ""))
                    {
                        // Search for left delimiter and Calculate offset
                        pFrom = ls == "" ? 0 : partial.IndexOf(ls) + ls.Length;
                        // Move right of offset
                        partial = partial.Substring(pFrom);
                        // Search for right delimiter and Calculate length to parse
                        // NOTE(review): with an empty right delimiter this takes Length - 1 and so
                        // drops the last character, while the non-recursive branch below uses
                        // Length. Confirm which is intended.
                        pTo = rs == "" ? (partial.Length - 1) : partial.IndexOf(rs);
                        // Parse it
                        var parsed = partial.Substring(0, pTo);
                        list.Add(parsed);
                        // Move right of parsed + right
                        partial = partial.Substring(parsed.Length + rs.Length);
                    }
                }
                catch { }
            }
        }
        // Non-recursive
        else
        {
            if (useRegexLR)
            {
                // NOTE(review): unlike the recursive branch, this regex call is not inside a
                // try/catch, so an invalid pattern propagates to the caller.
                var pattern = BuildLRPattern(ls, rs);
                MatchCollection mc = Regex.Matches(partial, pattern);
                if (mc.Count > 0)
                {
                    list.Add(mc[0].Value);
                }
            }
            else
            {
                try
                {
                    pFrom = ls == "" ? 0 : partial.IndexOf(ls) + ls.Length;
                    partial = partial.Substring(pFrom);
                    pTo = rs == "" ? partial.Length : partial.IndexOf(rs);
                    list.Add(partial.Substring(0, pTo));
                }
                catch { }
            }
        }
        break;

    // CSS selector parsing on the input treated as HTML.
    case ParseType.CSS:
        HtmlParser parser = new HtmlParser();
        AngleSharp.Dom.Html.IHtmlDocument document = null;
        // A parse failure leaves document null; the resulting error below is swallowed too.
        try
        {
            document = parser.Parse(original);
        }
        catch { }

        try
        {
            if (recursive)
            {
                // Collect the requested attribute from every element matching the selector.
                foreach (var element in document.QuerySelectorAll(ReplaceValues(cssSelector, data)))
                {
                    switch (ReplaceValues(attributeName, data))
                    {
                    case "innerHTML":
                        list.Add(element.InnerHtml);
                        break;

                    case "outerHTML":
                        list.Add(element.OuterHtml);
                        break;

                    default:
                        // Take the first attribute with the requested name.
                        foreach (var attr in element.Attributes)
                        {
                            if (attr.Name == ReplaceValues(attributeName, data))
                            {
                                list.Add(attr.Value);
                                break;
                            }
                        }
                        break;
                    }
                }
            }
            else
            {
                // Only the element at cssElementIndex is used; an index outside the match list
                // ends up in the catch below.
                switch (ReplaceValues(attributeName, data))
                {
                case "innerHTML":
                    list.Add(document.QuerySelectorAll(ReplaceValues(cssSelector, data))[cssElementIndex].InnerHtml);
                    break;

                case "outerHTML":
                    list.Add(document.QuerySelectorAll(ReplaceValues(cssSelector, data))[cssElementIndex].OuterHtml);
                    break;

                default:
                    foreach (var attr in document.QuerySelectorAll(ReplaceValues(cssSelector, data))[cssElementIndex].Attributes)
                    {
                        if (attr.Name == ReplaceValues(attributeName, data))
                        {
                            list.Add(attr.Value);
                            break;
                        }
                    }
                    break;
                }
            }
        }
        catch { }
        break;

    // JSON parsing, either through JToken selection or through the flat key/value helper.
    case ParseType.JSON:
        if (JTokenParsing)
        {
            // NOTE(review): JObject.Parse is not guarded here, so invalid JSON propagates to the
            // caller, and jsonField is used without ReplaceValues, unlike the branch below.
            JObject json = JObject.Parse(original);
            var jsonlist = json.SelectTokens(jsonField, false);
            foreach (var j in jsonlist)
            {
                list.Add(j.ToString());
            }
        }
        else
        {
            var jsonlist = new List <KeyValuePair <string, string> >();
            parseJSON("", original, jsonlist);
            // Keep the values whose key equals the requested field.
            foreach (var j in jsonlist)
            {
                if (j.Key == ReplaceValues(jsonField, data))
                {
                    list.Add(j.Value);
                }
            }
        }
        break;

    case ParseType.XPATH:
        // NOT IMPLEMENTED YET
        break;

    // Regex parsing with an output template.
    case ParseType.REGEX:
        try
        {
            var matches = Regex.Matches(partial, ReplaceValues(regexString, data));
            foreach (Match match in matches)
            {
                // Each "[i]" placeholder in the output template is replaced by group i of the match.
                var output = ReplaceValues(regexOutput, data);
                for (var i = 0; i < match.Groups.Count; i++)
                {
                    output = output.Replace("[" + i + "]", match.Groups[i].Value);
                }
                list.Add(output);
            }
        }
        catch { }
        break;
    }
    return(list);
}