示例#1
0
        /// <summary>
        /// Builds one Resource per table of the document, taken from the table's first
        /// "anchor" element or, when there is none, from its first "a" element whose href
        /// does not contain "html".
        /// </summary>
        /// <param name="htmlDoc">Parsed document whose tables are scanned.</param>
        /// <returns>The collected resources in document order; empty when no table qualifies.</returns>
        /// <remarks>
        /// A fallback "a" element without an href attribute causes a NullReferenceException,
        /// because the attribute value is used without a null check.
        /// </remarks>
        private static List <Resource> FindResources(AngleSharp.Dom.Html.IHtmlDocument htmlDoc)
        {
            var collected = new List <Resource>();

            foreach (var table in htmlDoc.QuerySelectorAll("table"))
            {
                // Preferred source: an <anchor> element inside the table.
                var anchor = table.QuerySelector("anchor");
                if (anchor != null)
                {
                    collected.Add(new Resource()
                    {
                        Description = anchor.TextContent,
                        Link        = anchor.GetAttribute("pkqdhpef")
                    });
                    continue;
                }

                // Fallback: the first <a>, unless its href mentions "html".
                var hyperlink = table.QuerySelector("a");
                if (hyperlink == null || hyperlink.GetAttribute("href").Contains("html"))
                {
                    continue;
                }

                // The title attribute wins over the link text when it is present and non-empty.
                var title = hyperlink.GetAttribute("title");
                collected.Add(new Resource()
                {
                    Description = string.IsNullOrEmpty(title) ? hyperlink.TextContent : title,
                    Link        = hyperlink.TextContent
                });
            }

            return collected;
        }
示例#2
0
        /// <summary>
        /// Looks for the "commission file n..." landmark near the top of the document and
        /// returns the trimmed text of the first non-blank element found after it.
        /// </summary>
        /// <param name="doc">Parsed filing document.</param>
        /// <returns>
        /// The trimmed text of the first non-blank element after the landmark, or an empty
        /// string when the landmark is missing or nothing is found within the search limit.
        /// </returns>
        public static string ExtractRegisteredCompanyName(AngleSharp.Dom.Html.IHtmlDocument doc)
        {
            var       tableSelector        = doc.QuerySelectorAll("*");
            const int ELEMENT_SEARCH_LIMIT = 200;

            string pattern       = "^commission file n";
            int    landmarkIndex = findByRegex(tableSelector, 0, pattern, RegexOptions.IgnoreCase);

            if (landmarkIndex > ELEMENT_SEARCH_LIMIT || landmarkIndex < 0)
            {
                Console.WriteLine("Cannot find registered company landmark in first " + ELEMENT_SEARCH_LIMIT + " elements of html doc");
                return("");
            }
            int postLandmarkIndex = skipPastRegex(tableSelector, landmarkIndex, pattern, RegexOptions.IgnoreCase);

            // Sometimes there's _____ or a file number, etc. between the landmark and the company name,
            // so skip past those filler elements.
            // NOTE(review): the pattern matches two consecutive NON-alphabetic characters; confirm against
            // skipPastRegex that this really skips "elements without two alphabetic characters".
            postLandmarkIndex = skipPastRegex(tableSelector, postLandmarkIndex, "[^A-Za-z]{2}", RegexOptions.IgnoreCase);

            // Defensive: a negative index cannot be scanned from (assumes the helper may signal
            // "not found" this way, as findByRegex appears to - TODO confirm).
            if (postLandmarkIndex < 0)
            {
                return("");
            }

            // The loop must be bounded on the running index (the original tested the fixed start
            // index, so it never terminated normally) and on the real number of elements.
            int scanEnd = Math.Min(ELEMENT_SEARCH_LIMIT, tableSelector.Length);
            for (int i = postLandmarkIndex; i < scanEnd; ++i)
            {
                string text = tableSelector[i].TextContent.Trim();
                if (text.Length > 0)
                {
                    return(text);
                }
            }
            return("");
        }
示例#3
0
        /// <summary>
        /// Locates the statement table in the document by walking a chain of landmarks and
        /// forwards it to the index-based ExtractTableFromHTML overload.
        /// </summary>
        /// <param name="doc">Parsed filing document.</param>
        /// <param name="additionalLandmarks">Landmarks to find, in order, before searching for the statement title; may be empty.</param>
        /// <param name="statementTitle">Title landmark of the statement itself.</param>
        /// <param name="statementInstanceIndex">Not used by this method.</param>
        /// <param name="outputPath">Passed through to the extraction overload.</param>
        /// <param name="rowHeadOverrides">Passed through to the extraction overload.</param>
        /// <param name="config">Passed through to the extraction overload.</param>
        public static void ExtractTableFromHTML(AngleSharp.Dom.Html.IHtmlDocument doc, IList <string> additionalLandmarks, string statementTitle, int statementInstanceIndex, string outputPath, IDictionary <string, string> rowHeadOverrides, IDictionary <string, string> config)
        {
            // Flat list of every element in the DOM; all helper searches work on indexes into it.
            var allElements = doc.QuerySelectorAll("*");

            int searchFrom = 0;
            if (additionalLandmarks.Count > 0)
            {
                // Walk the landmark sequence so that earlier mentions of the title (e.g. in the TOC) are skipped ...
                int lastFound = findLandmarks(allElements, 0, additionalLandmarks);

                // ... then continue after the final landmark of the sequence.
                string finalLandmark = additionalLandmarks[additionalLandmarks.Count - 1];
                searchFrom = skipPastLandmark(allElements, lastFound, finalLandmark);
            }

            // The statement title itself.
            int titleIndex = findLandmark(allElements, searchFrom, statementTitle);

            // Prefer a table that contains the title; otherwise take the first table after it.
            int tableIndex = findContainingElementByType(allElements, titleIndex, "table");
            if (tableIndex == -1)
            {
                tableIndex = findFollowingElementByType(allElements, titleIndex, "table");
                if (tableIndex == -1)
                {
                    Console.WriteLine("No landmarked table found");
                    return;
                }
            }

            ExtractTableFromHTML(allElements, tableIndex, outputPath, rowHeadOverrides, config);
        }
示例#4
0
 /// <summary>
 /// Reads every "li" element carrying a "logr" attribute from the page and adds an
 /// ApartmentHouseInfo for each one whose URL is not found in DataContent.ApartmentHouseInfos.
 /// </summary>
 /// <param name="confInfo">Dynamic configuration object; its shortcutname.Value and cityname.Value members are read.</param>
 /// <param name="page">Parsed listing page.</param>
 private static void GetDataOnPageDoc(dynamic confInfo,
                                      AngleSharp.Dom.Html.IHtmlDocument page)
 {
     var listings = page.QuerySelectorAll("li").Where(li => li.HasAttribute("logr"));

     foreach (var listing in listings)
     {
         var title      = listing.QuerySelector("h2").TextContent;
         var titleParts = title.Split(' ');
         var priceText  = listing.QuerySelector("b").TextContent;

         // A price that does not parse leaves the numeric price at 0.
         int.TryParse(priceText, out var numericPrice);

         var listingUrl = $"http://{confInfo.shortcutname.Value}.58.com" + listing.QuerySelector("a").GetAttribute("href");

         // Skip listings that are already stored.
         if (DataContent.ApartmentHouseInfos.Find(listingUrl) != null)
         {
             continue;
         }

         // Location is the first title token only when both marker words are tokens of the title.
         bool hasBothMarkers = new[] { "公寓", "青年社区" }.All(marker => titleParts.Contains(marker));

         var record = new ApartmentHouseInfo
         {
             HouseTitle       = title,
             HouseOnlineURL   = listingUrl,
             DisPlayPrice     = priceText,
             HouseLocation    = hasBothMarkers ? titleParts[0] : titleParts[1],
             DataCreateTime   = DateTime.Now,
             Source           = ConstConfigurationName.PinPaiGongYu,
             HousePrice       = numericPrice,
             HouseText        = title,
             LocationCityName = confInfo.cityname.Value,
             PubTime          = DateTime.Now
         };
         DataContent.ApartmentHouseInfos.Add(record);
     }
 }
        /// <summary>
        /// Returns true when the element has a class attribute whose text contains
        /// <paramref name="fragment"/> (substring match, not a class-token match).
        /// Elements without a class attribute yield false instead of throwing.
        /// </summary>
        private static bool ClassNameContains(IElement element, string fragment)
        {
            string className = element.ClassName;
            return className != null && className.Contains(fragment);
        }

        /// <summary>
        /// Builds a GeneralPost from one tweet stream item and appends it to the shared
        /// posts list (under a lock on that list).
        /// </summary>
        /// <param name="item">Stream item element; its data-item-id attribute is parsed as the tweet id.</param>
        /// <param name="dict">Word list handed to the censor together with the tweet text.</param>
        /// <remarks>
        /// Required parts (tweet text, tweet div, avatar, data-item-id) still throw when missing.
        /// The image and the date are optional: a missing image leaves Image unset and a date
        /// that cannot be read leaves Date unset.
        /// </remarks>
        private void PostSearch(IElement item, List <string> dict)
        {
            GeneralPost tweet = new GeneralPost();

            // Optional media: the first image inside the first media container.
            var mediaContainer = item.QuerySelectorAll("div").FirstOrDefault(k => ClassNameContains(k, "AdaptiveMediaOuterContainer"));

            if (mediaContainer != null)
            {
                var image = mediaContainer.QuerySelector("img");
                if (image != null)
                {
                    tweet.Image = image.Attributes["src"].Value;
                }
            }
            long id = long.Parse(item.Attributes["data-item-id"].Value);

            tweet.Text = item.QuerySelectorAll("p").First(k => ClassNameContains(k, "tweet-text")).InnerHtml;

            Cenzor cenzor = new Cenzor();

            tweet.Text = cenzor.Cenz(tweet.Text, dict);

            tweet.Social = SocialMedia.Twitter;

            // Author name and screen name live on the same div; look it up once.
            var tweetDiv = item.QuerySelectorAll("div").First(k => ClassNameContains(k, "tweet"));
            tweet.AuthorName = tweetDiv.Attributes["data-name"].Value;
            string linkname = tweetDiv.Attributes["data-screen-name"].Value;

            tweet.PostLink     = "https://twitter.com/" + linkname + "/status/" + id;
            tweet.AuthorLink   = "https://twitter.com/" + linkname;
            tweet.AuthorAvatar = item.QuerySelectorAll("img").First(y => ClassNameContains(y, "avatar")).Attributes["src"].Value;
            try
            {
                var timestampLink = item.QuerySelectorAll("div").First(k => ClassNameContains(k, "content"))
                                    .QuerySelectorAll("div").First(o => ClassNameContains(o, "stream-item-header"))
                                    .QuerySelectorAll("small").First(u => ClassNameContains(u, "time"))
                                    .QuerySelectorAll("a").First(f => ClassNameContains(f, "tweet-timestamp"));

                // The title is split on '-' into a "HH:mm " part and a date part; the date part is
                // split on spaces and tokens 1..3 are used as day, month and year.
                var elemwithdate    = timestampLink.Attributes["title"].Value;
                var massivstrdate   = elemwithdate.Split('-');
                var massivyearmohtn = massivstrdate[1].Split(' ');
                var timePart  = massivstrdate[0].TrimEnd(' ');
                var dayPart   = massivyearmohtn[1];
                var monthPart = massivyearmohtn[2];
                var yearPart  = massivyearmohtn[3];
                var hourMinute = timePart.Split(':');

                // The former lookup of the "_timestamp" span was never used, but a missing span
                // discarded the date; it has been removed.
                tweet.Date = (new DateTime(Int32.Parse(yearPart), getMonth(monthPart), Int32.Parse(dayPart), Int32.Parse(hourMinute[0]), Int32.Parse(hourMinute[1]), 0));
            }
            catch
            {
                // Deliberate best effort: a tweet whose timestamp cannot be read is kept without a date.
            }
            AngleSharp.Parser.Html.HtmlParser parser       = new AngleSharp.Parser.Html.HtmlParser();
            AngleSharp.Dom.Html.IHtmlDocument htmldocument = parser.Parse(tweet.Text);

            // Strip anchor tags from the text, keeping their inner markup.
            var links = htmldocument.QuerySelectorAll("a");

            foreach (var link in links)
            {
                tweet.Text = tweet.Text.Replace(link.OuterHtml, link.InnerHtml);
            }

            // PostSearch runs on one thread per item (see GetTweets), so the shared list is guarded.
            lock (posts)
            {
                posts.Add(tweet);
            }
        }
        /// <summary>
        /// Adds the "img-responsive" class to every img element of the document.
        /// </summary>
        /// <param name="document">Document that is modified in place.</param>
        private static void AddImageClassResponsive(AngleSharp.Dom.Html.IHtmlDocument document)
        {
            foreach (var image in document.QuerySelectorAll("img"))
            {
                image.ClassList.Add("img-responsive");
            }
        }
示例#7
0
        /// <summary>
        /// Downloads and parses the recipe page with the given number and copies the recipe
        /// parts into the shared <c>blydo</c> object.
        /// </summary>
        /// <param name="PageNumber">Number inserted into the recipe URL.</param>
        /// <remarks>
        /// The method is <c>async void</c>: callers cannot await it, and the assignments below
        /// happen only after the asynchronous parse has completed.
        /// </remarks>
        static public async void PageParsing(int PageNumber)
        {
            string recipeUrl = "https://e-dostavka.by/recipe/hot/" + PageNumber + ".html";

            HtmlParser htmlParser = new HtmlParser();
            AngleSharp.Dom.Html.IHtmlDocument recipePage = await htmlParser.ParseAsync(DownloadPage(recipeUrl));

            // Dish name
            blydo.BlydoName = recipePage.QuerySelector("h1").TextContent;

            // Dish picture
            blydo.BlydoPicture = recipePage.QuerySelector("img.retina_redy").GetAttribute("src");

            // Ingredient links and their accompanying spans
            blydo.IngName = recipePage.QuerySelectorAll("li.not_in_cart a");
            blydo.Count   = recipePage.QuerySelectorAll("li.not_in_cart span");

            // Both properties are filled from the same "a.fancy_img" selector.
            blydo.BlydoSP_Picture           = recipePage.QuerySelectorAll("a.fancy_img");
            blydo.BlydoSposobPrigotovleniya = recipePage.QuerySelectorAll("a.fancy_img");
        }
 /// <summary>
 /// Replaces the inline content item element with the nodes parsed from
 /// <paramref name="fragmentText"/>; when the strict parser rejects the fragment, the
 /// element is replaced with a text node describing the problem instead.
 /// </summary>
 /// <param name="document">Document used to create the error text node.</param>
 /// <param name="inlineContentItemElement">Element to be replaced; its parent is the parsing context.</param>
 /// <param name="contentItemCodename">Codename reported in the error text.</param>
 /// <param name="inlineContentItem">Item whose type is reported in the error text.</param>
 /// <param name="fragmentText">HTML fragment to parse.</param>
 private void ReplaceElementWithFragmentNodes(AngleSharp.Dom.Html.IHtmlDocument document, IElement inlineContentItemElement, string contentItemCodename, object inlineContentItem, string fragmentText)
 {
     try
     {
         var context     = inlineContentItemElement.ParentElement;
         var replacement = _strictHtmlParser.ParseFragment(fragmentText, context).ToArray();
         inlineContentItemElement.Replace(replacement);
     }
     catch (HtmlParseException exception)
     {
         // Surface the parse position in the rendered output rather than failing the whole resolution.
         string message = $"[Inline content item resolver provided an invalid HTML 5 fragment ({exception.Position.Line}:{exception.Position.Column}). Please check the output for a content item {contentItemCodename} of type {GetInlineContentItemType(inlineContentItem)}.]";
         inlineContentItemElement.Replace(document.CreateTextNode(message));
     }
 }
示例#9
0
        /// <summary>
        /// Searches the first elements of the document for text of the form "FORM 10-K" or
        /// "FORM 10-Q" (case-insensitive, optional spaces around the dash).
        /// </summary>
        /// <param name="doc">Parsed filing document.</param>
        /// <returns>
        /// The upper-cased form designation passed through EliminateWhitespace, or an empty
        /// string when nothing matches within the search limit.
        /// </returns>
        public static string ExtractFormType(AngleSharp.Dom.Html.IHtmlDocument doc)
        {
            var       tableSelector        = doc.QuerySelectorAll("*");
            const int ELEMENT_SEARCH_LIMIT = 200;

            // Documents with fewer elements than the limit must not be indexed past their end.
            int scanEnd = Math.Min(ELEMENT_SEARCH_LIMIT, tableSelector.Length);

            for (int iElement = 0; iElement < scanEnd; ++iElement)
            {
                var   element = tableSelector[iElement];
                Match m       = Regex.Match(element.TextContent.Trim().ToUpper(), @"FORM\s+(10\s?-\s?[KQ])");
                if (m.Success)
                {
                    // Group 1 is the "10-K" / "10-Q" part, possibly with inner spaces.
                    return(EliminateWhitespace(m.Groups[1].Value));
                }
            }
            return("");  // Not found
        }
示例#10
0
        /// <summary>
        /// Collects a Resource for every "li" under a "div.p_list" that has an anchor, a span
        /// and an "a.d1" link.
        /// </summary>
        /// <param name="htmlDoc">Parsed document to scan.</param>
        /// <returns>The collected resources; empty when no list item qualifies.</returns>
        private static List <Resource> FindResources(AngleSharp.Dom.Html.IHtmlDocument htmlDoc)
        {
            var resources = new List <Resource>();

            foreach (var li in htmlDoc.QuerySelectorAll("div.p_list").SelectMany(l => l.QuerySelectorAll("li")))
            {
                var titleAnchor = li.QuerySelector("a");
                var linkAnchor  = li.QuerySelector("a.d1");

                // The link comes from "a.d1", so that element must be checked too; the original
                // only verified that some anchor existed and then dereferenced "a.d1".
                if (titleAnchor == null || linkAnchor == null || li.QuerySelector("span") == null)
                {
                    continue;
                }

                resources.Add(new Resource()
                {
                    Description = titleAnchor.GetAttribute("title"),
                    Link        = linkAnchor.GetAttribute("href"),
                });
            }

            return(resources);
        }
示例#11
0
        /// <summary>
        /// Reads the csrf-param and csrf-token meta tags of the document and returns them,
        /// together with the "utf8" marker, as form values.
        /// </summary>
        /// <param name="doc">Parsed page expected to contain both meta tags.</param>
        /// <returns>A dictionary with the "utf8" entry and one entry mapping the CSRF parameter name to its token.</returns>
        /// <exception cref="System.InvalidOperationException">
        /// The csrf-param meta tag or its content is missing, or the csrf-token meta tag is missing.
        /// These cases previously surfaced as NullReferenceException or ArgumentNullException.
        /// </exception>
        protected Dictionary <string, string> GetCSRFParams(AngleSharp.Dom.Html.IHtmlDocument doc)
        {
            var paramMeta = doc.QuerySelector("meta[name='csrf-param']");
            var csrfParam = paramMeta == null ? null : paramMeta.GetAttribute("content");

            if (csrfParam == null)
            {
                // The value is used as a dictionary key below and therefore must exist.
                throw new System.InvalidOperationException("The page has no usable meta[name='csrf-param'] tag.");
            }

            Log.Verbose("Found CSRF Param: " + csrfParam);

            var tokenMeta = doc.QuerySelector("meta[name='csrf-token']");

            if (tokenMeta == null)
            {
                throw new System.InvalidOperationException("The page has no meta[name='csrf-token'] tag.");
            }

            var csrfToken = tokenMeta.GetAttribute("content");

            Log.Verbose("Found CSRF Token: " + csrfToken);

            var values = new Dictionary <string, string>
            {
                { "utf8", "✓" },
                { csrfParam, csrfToken }
            };

            return(values);
        }
        /// <summary>
        /// Parses the HTML, runs PostSearch on a separate thread for every "li" whose class
        /// text contains "stream-item" but not "AdaptiveStream", waits for all threads and
        /// returns the shared posts list.
        /// </summary>
        /// <param name="html">Markup containing the tweet stream.</param>
        /// <param name="dict">Word list forwarded to PostSearch.</param>
        /// <returns>The <c>posts</c> field that PostSearch appends to.</returns>
        public List <GeneralPost> GetTweets(string html, List <string> dict)
        {
            var htmlParser = new AngleSharp.Parser.Html.HtmlParser();
            AngleSharp.Dom.Html.IHtmlDocument document = htmlParser.Parse(html);

            var workers = new List <Thread>();

            foreach (var candidate in document.QuerySelectorAll("li"))
            {
                // Only genuine stream items; elements without a class attribute are ignored.
                string classes = candidate.ClassName;
                if (classes == null || !classes.Contains("stream-item") || classes.Contains("AdaptiveStream"))
                {
                    continue;
                }

                var streamItem = candidate;
                var worker     = new Thread(() => PostSearch(streamItem, dict));
                workers.Add(worker);
                worker.Start();
            }

            // Block until every item has been processed.
            foreach (Thread started in workers)
            {
                started.Join();
            }

            return posts;
        }
示例#13
0
        /// <summary>
        /// Reads and parses one filing, determines its registered company name and form
        /// type, and extracts its tables either with the company's configuration file
        /// (when name, form type and file are all available) or with the generic approach.
        /// </summary>
        /// <param name="sourceFilePath">Path of the filing HTML file.</param>
        static void ExtractFromFiling(string sourceFilePath)
        {
            Console.WriteLine("Extracting from " + sourceFilePath);

            // Read and parse the filing; any failure is reported and ends processing of this file.
            AngleSharp.Dom.Html.IHtmlDocument filing;
            try
            {
                filing = ReadAndParseHtmlFile(sourceFilePath);
            }
            catch (Exception readError)
            {
                Console.WriteLine("Exception reading " + sourceFilePath + ":");
                Console.WriteLine(readError);
                return;
            }

            if (filing == null)
            {
                Console.WriteLine("No data parsing " + sourceFilePath + ":");
                return;
            }

            string companyName = ExtractRegisteredCompanyName(filing);
            string filingForm  = ExtractFormType(filing);

            Console.WriteLine("Extracting Tables from Form " + filingForm + " for " + companyName);

            // The configuration file is looked up by company name under the relative "config" folder.
            string configPath   = @"config\" + companyName + ".json";
            bool   canUseConfig = companyName.Length > 0 && filingForm.Length > 0 && File.Exists(configPath);

            if (!canUseConfig)
            {
                // No usable configuration: fall back to the generalized approach.
                ExtractFromFilingWithoutConfiguration(filing, companyName, filingForm);
                return;
            }

            ExtractFromFilingUsingConfiguration(filing, configPath);
        }
示例#14
0
        /// <summary>
        /// Wraps "li" elements whose parent is neither "ul" nor "ol" into a new "ul": all
        /// "li" children of the same parent are moved into one list that is appended to
        /// that parent. Repeats until no such item is left.
        /// </summary>
        /// <param name="doc">Document that is modified in place.</param>
        private static void LiTagsCheck(AngleSharp.Dom.Html.IHtmlDocument doc)
        {
            AngleSharp.Dom.IElement firstLi;

            // Re-query on every pass: each pass re-parents at least the item found, so the loop always advances.
            while ((firstLi = doc.All.FirstOrDefault(x => x.LocalName == "li" && (x.ParentElement == null || (x.ParentElement.LocalName != "ul" && x.ParentElement.LocalName != "ol")))) != null)
            {
                var parent = firstLi.ParentElement;
                if (parent == null)
                {
                    // A parentless item cannot be wrapped; stop instead of dereferencing null
                    // (or finding the same item forever).
                    break;
                }

                var ul = doc.CreateElement("ul");

                // Snapshot the children first: moving nodes while enumerating the parent's
                // child collection modifies the collection being iterated.
                var strayItems = parent.Children.Where(x => x.LocalName == "li").ToList();
                foreach (var item in strayItems)
                {
                    parent.RemoveChild(item);
                    ul.AppendChild(item);
                }
                parent.AppendChild(ul);
            }
        }
 /// <summary>
 /// Returns the "object" elements of the document body whose type attribute is
 /// "application/kenticocloud" and whose data-type attribute is "item".
 /// </summary>
 /// <param name="htmlInput">Parsed document whose body is searched.</param>
 /// <returns>A new list with the matching elements in document order.</returns>
 private static List <IElement> GetInlineContentItemElements(AngleSharp.Dom.Html.IHtmlDocument htmlInput)
 {
     var inlineItems = new List <IElement>();

     foreach (var candidate in htmlInput.Body.GetElementsByTagName("object"))
     {
         bool isKenticoObject = candidate.GetAttribute("type") == "application/kenticocloud";
         if (isKenticoObject && candidate.GetAttribute("data-type") == "item")
         {
             inlineItems.Add(candidate);
         }
     }

     return inlineItems;
 }
示例#16
0
        /// <summary>
        /// Handles a finished page crawl: records the page as crawled and, for pages fetched
        /// with HTTP 200 and no web exception, optionally extracts the title and the
        /// description/keywords meta values and the links found on the page.
        /// </summary>
        /// <param name="sender">Event source; not used.</param>
        /// <param name="e">Event data carrying the crawled page.</param>
        private void crawler_CrawlerComplete(object sender, PageCrawlCompletedArgs e)
        {
            CrawledPage crawled = e.CrawledPage;

            bool fetchedOk = crawled.WebException == null && crawled.HttpWebResponse.StatusCode == System.Net.HttpStatusCode.OK;
            if (!fetchedOk)
            {
                // Failed pages are still remembered so they are not treated as new.
                Console.WriteLine("## Error on {0}", crawled.Uri.ToString());
                Console.WriteLine();
                AddToCrawledPages(crawled.Uri.ToString());
                return;
            }

            Console.WriteLine("Crawl OK: {0}", crawled.Uri.ToString());
            string pageUrl = crawled.Uri.ToString();
            AddToCrawledPages(pageUrl);

            if (IncludeMetaData)
            {
                Console.Write("Extracting meta: ");

                // Only URLs accepted by IsPageUrl are inspected for meta data.
                if (!IsPageUrl(pageUrl))
                {
                    Console.WriteLine("     NO VALID PAGE");
                }
                else
                {
                    AngleSharp.Dom.Html.IHtmlDocument dom = crawled.AngleSharpHtmlDocument;

                    Models.MetaData record = new Models.MetaData();
                    record.Url = pageUrl;

                    // Title: empty string when the page has no title element.
                    AngleSharp.Dom.IElement titleNode = dom.QuerySelector("title");
                    record.Title = titleNode == null ? "" : titleNode.TextContent;
                    Console.Write("title ");

                    // Description and keywords come from <meta name="..."> elements.
                    foreach (var meta in dom.QuerySelectorAll("meta"))
                    {
                        if (!meta.HasAttribute("name"))
                        {
                            continue;
                        }

                        switch (meta.Attributes["name"].Value)
                        {
                            case "description":
                                record.Description = meta.Attributes["content"].Value;
                                Console.Write("description" + " ");
                                break;

                            case "keywords":
                                record.Keywords = meta.Attributes["content"].Value;
                                Console.Write("keywords" + " ");
                                break;
                        }
                    }

                    MetaData.Add(record);
                    Console.WriteLine("     OK");
                }
            }

            if (ExtractLinks)
            {
                AngleSharp.Dom.Html.IHtmlDocument dom = crawled.AngleSharpHtmlDocument;

                // Anchors, optionally restricted to descendants of the configured selector.
                string anchorSelector = "a";
                if (!String.IsNullOrWhiteSpace(ClassSelector))
                {
                    anchorSelector = ClassSelector + " " + anchorSelector;
                }

                List <string> harvested = new List <string>();
                foreach (var anchor in dom.QuerySelectorAll(anchorSelector))
                {
                    if (!anchor.HasAttribute("href"))
                    {
                        continue;
                    }

                    string target = anchor.Attributes["href"].Value;

                    // Script, mail and phone links are not crawlable.
                    if (target.Contains("javascript:") || target.Contains("mailto:") || target.Contains("tel:"))
                    {
                        continue;
                    }

                    // Anything not starting with "http" is prefixed with scheme and authority of the page.
                    string siteRoot = crawled.Uri.GetLeftPart(UriPartial.Authority);
                    if (!target.StartsWith("http"))
                    {
                        target = siteRoot + "/" + target;
                    }

                    if (!string.IsNullOrWhiteSpace(target))
                    {
                        harvested.Add(target);
                    }
                }

                // Remember the links under the URL of the page they were found on.
                this.PagesCrawledLinks.Add(pageUrl, harvested);
            }

            // Blank line between pages in the console output.
            Console.WriteLine();
        }
示例#17
0
        /// <summary>
        /// Generic extraction without a company configuration: finds the table of contents,
        /// collects the TOC anchors whose text mentions "statement" or "balance", and runs
        /// ExtractTableFromHTML for each collected anchor target.
        /// </summary>
        /// <param name="htmlDoc">Parsed filing document.</param>
        /// <param name="registeredCompanyName">Company name used in the output file name.</param>
        /// <param name="formName">Form type used in the output file name.</param>
        /// <remarks>
        /// Returns early when no TOC is found. Ends by waiting for a line on standard input.
        /// </remarks>
        static void ExtractFromFilingWithoutConfiguration(AngleSharp.Dom.Html.IHtmlDocument htmlDoc, string registeredCompanyName, string formName)
        {
            // Select all the DOM's elements
            var tableSelector = htmlDoc.QuerySelectorAll("*");

            // See if there's a TOC
            int iElement = findByRegex(tableSelector, 0, @"^united states", RegexOptions.IgnoreCase, 1000);

            iElement = findByRegex(tableSelector, iElement, @"^table of contents|^index", RegexOptions.IgnoreCase, 1000);

            string notesSectionTitleRegexPattern        = @"^(?:Condensed )?notes to .*consolidated (?:condensed )?financial statements|^Supplemental Financial Data";
            string notesSectionTitle                    = "";
            IDictionary <string, string> statementHrefs = new Dictionary <string, string>();

            if (iElement > 0)
            {
                Console.WriteLine("Processing TOC");
                // Get key landmarks from the TOC.  TOC is bounded by the HR that follows it.
                // Limit the search to 1000 elements and never index past the end of the document.
                int lastElementToConsider = Math.Min(iElement + 1000, tableSelector.Length);
                for (; iElement < lastElementToConsider && tableSelector[iElement].TagName.ToLower() != "hr"; ++iElement)
                {
                    var    element = tableSelector[iElement];
                    string text    = element.TextContent;

                    bool isStatementLink = element.TagName.ToLower() == "a" &&
                                           (Regex.IsMatch(text, "statement", RegexOptions.IgnoreCase) ||
                                            Regex.IsMatch(text, "balance", RegexOptions.IgnoreCase));
                    if (isStatementLink)
                    {
                        // One registration per anchor target: a text matching both words, or a
                        // second link to the same target, used to add a duplicate key and throw.
                        string href = element.GetAttribute("href");
                        if (href != null)
                        {
                            string target = href.Replace("#", "");
                            if (!statementHrefs.ContainsKey(target))
                            {
                                statementHrefs.Add(target, text.Trim());
                            }
                        }
                        iElement = skipPastLandmark(tableSelector, iElement, text);
                    }

                    if (Regex.IsMatch(text, notesSectionTitleRegexPattern, RegexOptions.IgnoreCase))
                    {
                        // Remember the notes section title as it appears in the TOC.
                        // NOTE(review): the value is not read again within this method.
                        notesSectionTitle = StandardizeWhitespace(text.Trim());
                        iElement          = skipPastRegex(tableSelector, iElement, notesSectionTitleRegexPattern, RegexOptions.IgnoreCase);
                    }
                }
            }
            else
            {
                Console.WriteLine("No TOC found.  Skipping (for now).");
                return;
            }

            foreach (var entry in statementHrefs)
            {
                Console.WriteLine(entry.Value + " @ " + entry.Key);
            }

            foreach (var entry in statementHrefs)
            {
                string outputFileDirectory = @"c:\temp\10QParseOutput\";        // MAKE ME CONFIGURABLE
                string outputFileName      = outputFileDirectory + registeredCompanyName + "_" + formName + "_" + entry.Value + "_" + entry.Key + ".txt";

                IDictionary <string, string> rowHeadOverrideDict = new Dictionary <string, string>();
                IDictionary <string, string> parametersDict      = new Dictionary <string, string>();

                ExtractTableFromHTML(tableSelector, entry.Key, outputFileName, rowHeadOverrideDict, parametersDict);
            }

            Console.WriteLine();
            // Keeps the console open until the user presses Enter.
            Console.ReadLine();
        }
示例#18
0
        /// <summary>
        /// Normalizes user-supplied markup in place: wraps stray list items, marks inputs
        /// with the "ignore" class, turns scripts into HTML-encoded code listings, makes
        /// sure code sits inside pre (and pre contains code), removes empty elements and
        /// stores inline base64 images through WriteImageString.
        /// </summary>
        /// <param name="doc">Document that is modified in place.</param>
        static void InputCheck(AngleSharp.Dom.Html.IHtmlDocument doc)
        {
            LiTagsCheck(doc);
            var input = doc.QuerySelectorAll("input");

            foreach (var item in input)
            {
                if (!item.ClassList.Contains("ignore"))
                {
                    item.ClassList.Add("ignore");
                }
            }

            // Every loop below moves or removes nodes, so each works on a snapshot (ToList)
            // rather than on a lazily evaluated view of the body's child collection.
            var scripts = doc.Body.Children.Where(x => x.LocalName == "script" && x.ParentElement.LocalName != "code").ToList();

            foreach (var item in scripts)
            {
                var parent = item.ParentElement;
                parent.RemoveChild(item);

                var code = doc.CreateElement("code");
                code.AppendChild(item);
                parent.AppendChild(code);
            }

            var scritps = doc.Scripts.ToList();

            foreach (var item in scritps)
            {
                // Replace the script with an encoded listing appended to the body.
                var pre = doc.CreateElement("pre");
                pre.ClassName = "code code-javascript";
                var code = doc.CreateElement("code");
                code.InnerHtml = HttpUtility.HtmlEncode(item.OuterHtml);
                pre.AppendChild(code);
                item.Remove();
                doc.Body.AppendChild(pre);
            }
            var codes = doc.Body.Children.Where(x => x.LocalName == "code" && x.ParentElement.LocalName != "pre").ToList();

            foreach (var item in codes)
            {
                var parent = item.ParentElement;
                parent.RemoveChild(item);

                var pre = doc.CreateElement("pre");
                pre.ClassName = "code code-javascript";
                pre.AppendChild(item);
                parent.AppendChild(pre);
            }
            var pres = doc.Body.Children.Where(x => x.LocalName == "pre").ToList();

            foreach (var item in pres)
            {
                if (!item.Children.Any(x => x.LocalName == "code"))
                {
                    // Move the element children of the pre into a new code element.
                    var code = doc.CreateElement("code");
                    foreach (var c in item.Children.ToList())
                    {
                        c.Remove();
                        code.AppendChild(c);
                    }
                    item.AppendChild(code);
                }
                if (!item.ClassList.Contains("code"))
                {
                    item.ClassList.Add("code");
                }
                if (item.TextContent.Contains("script"))
                {
                    item.ClassList.Add("code-javascript");
                }
                else if (item.TextContent.Contains("style"))
                {
                    item.ClassList.Add("code-css");
                }
            }
            var empty = doc.Body.QuerySelectorAll(":empty");

            foreach (var item in empty)
            {
                // NOTE(review): this is a substring test against "img video", so short names such
                // as "i" or "g" are kept as well - confirm whether an exact name match was intended.
                if (!"img video".Contains(item.LocalName))
                {
                    item.Remove();
                }
                else if (item.LocalName == "img")
                {
                    var src = item.GetAttribute("src");
                    if (src != null && src.Contains(";base64,"))
                    {
                        // The sub-folder must be relative: a leading "/" makes Path.Combine discard
                        // the application path and return the rooted second argument.
                        string filePath = System.IO.Path.Combine(HostingEnvironment.ApplicationPhysicalPath, "data", "img", "upload");

                        var str = src.WriteImageString(filePath);
                        if (str.Success)
                        {
                            item.SetAttribute("src", str.FileName);
                        }
                    }
                }
            }
        }
示例#19
0
        /// <summary>
        /// Parses an HTML string, collects the UI elements found under its body and groups
        /// them into root frames.
        /// </summary>
        /// <param name="html">HTML markup to parse.</param>
        /// <returns>
        /// The root frames that ended up with at least one child, ordered by their <c>y</c> value,
        /// or <c>null</c> when the HTML parser throws.
        /// </returns>
        public static UIBase[] ParseAttributesForUI(string html)
        {
            var options = Configuration.Default.WithCss();
            var parser  = new HtmlParser(options);

            AngleSharp.Dom.Html.IHtmlDocument doc = null;

            try
            {
                doc = parser.Parse(html);
            }
            catch (Exception)
            {
                // Callers get null (not an empty array) when the markup cannot be parsed.
                return(null);
            }

            var attributes      = new List<UIBase>();
            var attributesNotIn = new List<UIBase>();

            // Passed through to the recursive overload for every top-level element.
            int[] layout = { 0, 0, 0, 0 };

            // Get all the elements as one big flat list in attributes.
            foreach (var item in doc.Body.Children)
            {
                ParseAttributesForUI(attributes, attributesNotIn, item, layout);
            }

            // Second chance for the elements the overload reported in attributesNotIn:
            // attach each one to every collected frame that contains it.
            foreach (var spec in attributesNotIn)
            {
                foreach (var item in attributes)
                {
                    var frame = item as UIFrame;
                    if (frame != null && spec.IsIn(frame))
                    {
                        frame.AddChild(spec);
                    }
                }
            }

            // A frame is a root frame when it is not inside any other collected frame.
            var rootframes = new List<UIFrame>();

            foreach (var candidate in attributes)
            {
                UIFrame candidateFrame = candidate as UIFrame;
                if (candidateFrame == null)
                {
                    continue; // not a frame, so it cannot be a root frame
                }

                bool isNested = false;
                foreach (var other in attributes)
                {
                    UIFrame otherFrame = other as UIFrame;
                    if (otherFrame == null)
                    {
                        continue;
                    }
                    if (other == candidateFrame)
                    {
                        continue; // same object, nothing to test
                    }
                    if (candidateFrame.IsIn(otherFrame))
                    {
                        isNested = true;
                        break;
                    }
                }

                if (!isNested)
                {
                    rootframes.Add(candidateFrame);
                }
            }

            // The root frames are handled separately from here on, so take them out of the flat list.
            foreach (var frame in rootframes)
            {
                attributes.Remove(frame);
            }

            // Distribute the remaining elements over the root frames, always consuming the head of the list.
            UIBase elm = attributes.FirstOrDefault();

            while (elm != null)
            {
                UIFrame owner = null;
                foreach (var root in rootframes)
                {
                    if (elm.IsIn(root))
                    {
                        owner = root;
                        break;
                    }
                }

                if (owner != null)
                {
                    owner.AddChild(elm);
                }
                else
                {
                    // NOTE(review): rootframes only holds UIFrame, so an element that is not a
                    // UIFrame and lies in no root frame makes this cast throw InvalidCastException.
                    // Confirm whether such elements should be skipped or reported instead.
                    rootframes.Add((UIFrame)elm);
                }

                attributes.Remove(elm);
                elm = attributes.FirstOrDefault();
            }

            // Frames without children are not returned.
            rootframes.RemoveAll(frame => frame.Children == null || frame.Children.Count() == 0);

            return(rootframes.OrderBy(x => x.y).ToArray());
        }
 /// <summary>
 /// Wraps an AngleSharp HTML document: stores it in <c>Document</c> and wraps its
 /// <c>DocumentElement</c> in an <c>AngleSharpNode</c> exposed as <c>DocumentNode</c>.
 /// </summary>
 /// <param name="Document">The parsed document to wrap; must not be null.</param>
 /// <exception cref="System.ArgumentNullException"><paramref name="Document"/> is null.</exception>
 internal AngleSharpDocument(AngleSharp.Dom.Html.IHtmlDocument Document)
 {
     // Fail with a descriptive exception before any state is assigned, instead of a
     // NullReferenceException on Document.DocumentElement below.
     if (Document == null)
     {
         throw new System.ArgumentNullException("Document");
     }

     this.Document = Document;
     DocumentNode  = new AngleSharpNode(Document.DocumentElement);
 }
示例#21
0
        /// <summary>
        /// Reads a company configuration JSON file and, for every entry under
        /// <c>Filings / 10-Q / Tables</c>, extracts the described table from the filing document
        /// by calling <c>ExtractTableFromHTML</c>.
        /// </summary>
        /// <param name="htmlDoc">Parsed filing document, passed through to <c>ExtractTableFromHTML</c>.</param>
        /// <param name="companyConfigurationFileName">Path of the company configuration JSON file.</param>
        /// <remarks>
        /// A configuration file that cannot be read, or that has no <c>Filings / 10-Q / Tables</c>
        /// section, is reported on the console and nothing is extracted.
        /// </remarks>
        static void ExtractFromFilingUsingConfiguration(AngleSharp.Dom.Html.IHtmlDocument htmlDoc, string companyConfigurationFileName)
        {
            // TODO: make the output directory configurable.
            const string outputFileDirectory = @"c:\temp\10QParseOutput\";

            string companyConfigJson;

            try
            {
                companyConfigJson = File.ReadAllText(companyConfigurationFileName);
            }
            catch (Exception e)
            {
                Console.WriteLine("Exception on read of company configuration file " + companyConfigurationFileName);
                Console.WriteLine(e);
                return;
            }

            // Parse the company configuration JSON file and extract the salient details
            JObject config       = JObject.Parse(companyConfigJson);
            var     ticker       = getStringValueFromJObject(config, "Ticker");
            JObject jFilings     = (JObject)config["Filings"];
            JObject tenQfiling   = (jFilings != null) ? (JObject)jFilings["10-Q"] : null;
            JArray  filingTables = (tenQfiling != null) ? (JArray)tenQfiling["Tables"] : null;

            if (filingTables == null)
            {
                Console.WriteLine("No Filings/10-Q/Tables section in company configuration file " + companyConfigurationFileName);
                return;
            }

            foreach (var filingTable in filingTables.Values<JObject>())
            {
                string statementTitle = getStringValueFromJObject(filingTable, "StatementTitle");

                // JObject's string indexer returns the property's value token (not a JProperty),
                // so the number is read straight from that token. The key spelling is the one
                // the configuration files use.
                JToken titleOccurrenceToken = filingTable["TitleOccurence"];
                int    titleOccurrence      = (titleOccurrenceToken != null) ? titleOccurrenceToken.Value<int>() : 1;

                var    additionalLandmarks = new List<string>();
                JArray jLandmarks          = (JArray)filingTable["AdditionalLandmarks"];
                if (jLandmarks != null)
                {
                    foreach (var jLandmark in jLandmarks)
                    {
                        additionalLandmarks.Add(((JValue)jLandmark).ToString());
                    }
                }

                IDictionary<string, string> rowHeadOverrideDict = new Dictionary<string, string>();
                IDictionary<string, string> parametersDict      = new Dictionary<string, string>();

                // Both option lists are optional, also when the Options object itself is present.
                JObject jOptions = (JObject)filingTable["Options"];
                if (jOptions != null)
                {
                    addFirstPropertyOfEach((JArray)jOptions["RowHeadOverrides"], rowHeadOverrideDict);
                    addFirstPropertyOfEach((JArray)jOptions["Parameters"], parametersDict);
                }

                // Extract the current table
                string outputFileName = outputFileDirectory + ticker + "_" + statementTitle + "_" + titleOccurrence + ".txt";
                Console.WriteLine("Extracting table " + statementTitle + " to " + outputFileName);
                ExtractTableFromHTML(htmlDoc, additionalLandmarks, statementTitle, titleOccurrence, outputFileName, rowHeadOverrideDict, parametersDict);
            }
        }

        // Adds the first property (name -> value text) of every JSON object in entries to target;
        // a missing array and objects without properties are skipped.
        private static void addFirstPropertyOfEach(JArray entries, IDictionary<string, string> target)
        {
            if (entries == null)
            {
                return;
            }

            foreach (var entry in entries.Children<JObject>())
            {
                JProperty property = entry.Properties().FirstOrDefault();
                if (property == null)
                {
                    continue;
                }
                target.Add(property.Name, property.Value.ToString());
            }
        }
示例#22
0
        /// <summary>
        /// Extracts values from the parse target (after variable replacement) using the strategy
        /// selected by <c>Type</c>: left/right delimiters, CSS selector, JSON field or regex.
        /// </summary>
        /// <param name="data">Bot data handed to <c>ReplaceValues</c> for every configured string.</param>
        /// <returns>
        /// The extracted values in the order they were found; empty when nothing matched or the
        /// parse type is not implemented (XPATH).
        /// </returns>
        private List<string> Parse(BotData data)
        {
            var original = ReplaceValues(parseTarget, data);
            var list     = new List<string>();

            switch (Type)
            {
            case ParseType.LR:
                ParseLeftRight(original, data, list);
                break;

            case ParseType.CSS:
                ParseCssSelector(original, data, list);
                break;

            case ParseType.JSON:
                ParseJsonField(original, data, list);
                break;

            case ParseType.XPATH:
                // NOT IMPLEMENTED YET
                break;

            case ParseType.REGEX:
                ParseRegex(original, data, list);
                break;
            }

            return(list);
        }

        // Adds to list the text found between the left and right delimiter strings of input.
        private void ParseLeftRight(string input, BotData data, List<string> list)
        {
            var ls = ReplaceValues(leftString, data);
            var rs = ReplaceValues(rightString, data);

            // No L and R = return full input
            if (ls == "" && rs == "")
            {
                list.Add(input);
                return;
            }

            // A non-empty delimiter that does not occur in the input: nothing to parse.
            if ((ls != "" && !input.Contains(ls)) || (rs != "" && !input.Contains(rs)))
            {
                return;
            }

            if (useRegexLR)
            {
                if (recursive)
                {
                    try
                    {
                        foreach (Match m in Regex.Matches(input, BuildLRPattern(ls, rs)))
                        {
                            list.Add(m.Value);
                        }
                    }
                    catch
                    {
                        // Best effort: a failing pattern leaves whatever was collected so far.
                    }
                }
                else
                {
                    // NOTE(review): unlike the recursive branch this one lets regex exceptions
                    // propagate to the caller; kept as is - confirm whether that is intended.
                    MatchCollection mc = Regex.Matches(input, BuildLRPattern(ls, rs));
                    if (mc.Count > 0)
                    {
                        list.Add(mc[0].Value);
                    }
                }
                return;
            }

            // Manual scan. IndexOf is ordinal so that it always agrees with the ordinal Contains
            // checks; the culture-sensitive default can miss or misplace delimiters such as "\n".
            var remaining = input;
            while ((ls == "" || remaining.Contains(ls)) && (rs == "" || remaining.Contains(rs)))
            {
                // Move to just past the left delimiter.
                var from = ls == "" ? 0 : remaining.IndexOf(ls, System.StringComparison.Ordinal) + ls.Length;
                remaining = remaining.Substring(from);

                // An empty right delimiter means "up to the end", including the last character.
                var length = rs == "" ? remaining.Length : remaining.IndexOf(rs, System.StringComparison.Ordinal);
                if (length < 0)
                {
                    break; // the right delimiter only occurs before the left one
                }

                list.Add(remaining.Substring(0, length));

                if (!recursive)
                {
                    break;
                }

                // Continue after the parsed value and its right delimiter.
                remaining = remaining.Substring(length + rs.Length);
            }
        }

        // Adds to list the inner HTML, outer HTML or an attribute value of the elements matching the CSS selector.
        private void ParseCssSelector(string input, BotData data, List<string> list)
        {
            AngleSharp.Dom.Html.IHtmlDocument document = null;
            try { document = new HtmlParser().Parse(input); } catch { }

            if (document == null)
            {
                return; // the input could not be parsed, so there is nothing to select from
            }

            try
            {
                var selector  = ReplaceValues(cssSelector, data);
                var attribute = ReplaceValues(attributeName, data);
                var position  = 0;

                foreach (var element in document.QuerySelectorAll(selector))
                {
                    // Non-recursive mode only looks at the match at cssElementIndex.
                    if (!recursive && position++ != cssElementIndex)
                    {
                        continue;
                    }

                    switch (attribute)
                    {
                    case "innerHTML":
                        list.Add(element.InnerHtml);
                        break;

                    case "outerHTML":
                        list.Add(element.OuterHtml);
                        break;

                    default:
                        foreach (var attr in element.Attributes)
                        {
                            if (attr.Name == attribute)
                            {
                                list.Add(attr.Value);
                                break;
                            }
                        }
                        break;
                    }

                    if (!recursive)
                    {
                        break;
                    }
                }
            }
            catch
            {
                // Best effort: a failing selector leaves whatever was collected so far.
            }
        }

        // Adds to list the values of the configured JSON field found in input.
        private void ParseJsonField(string input, BotData data, List<string> list)
        {
            if (JTokenParsing)
            {
                // NOTE(review): jsonField is used here without ReplaceValues, unlike the branch
                // below - confirm whether that is intended.
                JObject json = JObject.Parse(input);
                foreach (var token in json.SelectTokens(jsonField, false))
                {
                    list.Add(token.ToString());
                }
            }
            else
            {
                var pairs = new List<KeyValuePair<string, string>>();
                parseJSON("", input, pairs);

                var field = ReplaceValues(jsonField, data);
                foreach (var pair in pairs)
                {
                    if (pair.Key == field)
                    {
                        list.Add(pair.Value);
                    }
                }
            }
        }

        // Adds to list one formatted output per regex match; "[i]" in the output format is
        // replaced by the value of capture group i.
        private void ParseRegex(string input, BotData data, List<string> list)
        {
            try
            {
                var matches = Regex.Matches(input, ReplaceValues(regexString, data));
                var format  = ReplaceValues(regexOutput, data);

                foreach (Match match in matches)
                {
                    var output = format;
                    for (var i = 0; i < match.Groups.Count; i++)
                    {
                        output = output.Replace("[" + i + "]", match.Groups[i].Value);
                    }
                    list.Add(output);
                }
            }
            catch
            {
                // Best effort: a failing pattern leaves whatever was collected so far.
            }
        }