/// <summary>
/// Parses the web response content and gets only the bodies of its script tags.
/// </summary>
/// <param name="webResponseText">
/// The text of the web response; it is encoded as UTF-8 and handed to
/// <c>AspNetParseTree.InvokeParse</c>.
/// </param>
/// <param name="filter">
/// Optional predicate. When not null, only the script bodies for which it
/// returns true are kept.
/// </param>
/// <param name="scriptBodies">
/// The script bodies which were found. Null when the input is blank or the
/// parse yields nothing usable; may be an empty array when
/// <paramref name="filter"/> rejected every entry.
/// </param>
/// <returns>
/// True when at least one script body remains after filtering, otherwise false.
/// </returns>
public static bool TryGetScriptBodies(string webResponseText, Func<string, bool> filter, out string[] scriptBodies)
{
    scriptBodies = null;
    if (String.IsNullOrWhiteSpace(webResponseText))
    {
        return false;
    }

    var ms = new MemoryStream(Encoding.UTF8.GetBytes(webResponseText));
    var antlrHtml = AspNetParseTree.InvokeParse(ms);
    if (antlrHtml == null)
    {
        return false;
    }

    // A Try-method reports failure through its return value, so a parse result
    // with a missing collection is treated as "nothing found" instead of
    // surfacing as a NullReferenceException.
    var innerText = antlrHtml.CharData;
    if (innerText == null || innerText.Count <= 0)
    {
        return false;
    }

    var parsedScripts = antlrHtml.ScriptBodies;
    if (parsedScripts == null)
    {
        return false;
    }

    scriptBodies = parsedScripts.ToArray();
    if (filter != null)
    {
        scriptBodies = scriptBodies.Where(filter).ToArray();
    }

    return scriptBodies.Length > 0;
}
/// <summary>
/// Reduces <paramref name="rawHtml"/> down to just its html,
/// with no scripts, css styles, doc-types, etc.
/// </summary>
/// <param name="rawHtml">
/// The raw markup; it is encoded as UTF-8 before being handed to
/// <c>AspNetParseTree.InvokeParse</c>.
/// </param>
/// <returns>
/// Null when <paramref name="rawHtml"/> is null, empty or whitespace, or when
/// the parse returns null; otherwise the <c>HtmlOnly</c> value of the parse result.
/// </returns>
public static string GetHtmlOnly(string rawHtml)
{
    if (!String.IsNullOrWhiteSpace(rawHtml))
    {
        var utf8Bytes = Encoding.UTF8.GetBytes(rawHtml);
        var parsed = AspNetParseTree.InvokeParse(new MemoryStream(utf8Bytes));
        return parsed == null ? null : parsed.HtmlOnly;
    }

    return null;
}
/// <summary>
/// Parses a web response into the sorted, distinct link texts found in the
/// list items of any <c>ul</c> which follows a <c>div</c> whose text contains
/// 'Main article'.
/// </summary>
/// <param name="content">
/// The web response; passed to <c>GetWebResponseBody</c> to obtain its text.
/// </param>
/// <returns>
/// Null when there is no response body, no html remains after parsing, or no
/// matching anchors are found; otherwise a list holding a single object whose
/// <c>UsInsComNames</c> property is the sorted array of names.
/// </returns>
/// <exception cref="XmlException">
/// Thrown by <see cref="XmlDocument.LoadXml"/> when the parsed html is not well-formed xml.
/// </exception>
public override IEnumerable<dynamic> ParseContent(object content)
{
    var webResponseBody = GetWebResponseBody(content);
    if (webResponseBody == null)
    {
        return null;
    }

    var ms = new MemoryStream(Encoding.UTF8.GetBytes(webResponseBody));
    var htmlRslts = AspNetParseTree.InvokeParse(ms);
    if (string.IsNullOrWhiteSpace(htmlRslts?.HtmlOnly))
    {
        return null;
    }

    // The markup originates from a web response, so external entities and
    // DTDs must never be resolved while loading it.
    var xml = new XmlDocument { XmlResolver = null };
    xml.LoadXml(htmlRslts.HtmlOnly);

    var insComNameNodes = xml.SelectNodes("//div[contains(text(),'Main article')]/following-sibling::ul/li/a");
    if (insComNameNodes == null || insComNameNodes.Count <= 0)
    {
        return null;
    }

    // The set gives constant-time duplicate detection; the list is what gets sorted.
    var seenNames = new HashSet<string>();
    var insComNames = new List<string>();
    foreach (var node in insComNameNodes)
    {
        var elem = node as XmlElement;
        var name = elem?.InnerText;
        if (string.IsNullOrWhiteSpace(name) || _skipThese.Contains(name))
        {
            continue;
        }

        // Add returns false when the name has already been collected.
        if (!seenNames.Add(name))
        {
            continue;
        }

        insComNames.Add(name);
    }

    insComNames.Sort();

    return new List<dynamic> { new { UsInsComNames = insComNames.ToArray() } };
}
/// <summary>
/// Proof-of-concept: parses the 'AccountEdit_aspx.eg' test file and checks that
/// the result has both tag names and script bodies, echoing each to the console.
/// </summary>
public void TestPoc()
{
    var exampleFile = PutTestFileOnDisk("AccountEdit_aspx.eg");
    Assert.IsTrue(System.IO.File.Exists(exampleFile));

    var parsed = AspNetParseTree.InvokeParse(exampleFile);
    Assert.IsNotNull(parsed);

    // the tag-name to attributes mapping must be present and non-empty
    var tags = parsed.Tags2Attrs;
    Assert.IsNotNull(tags);
    Assert.AreNotEqual(0, tags.Keys.Count);
    foreach (var tagName in tags.Keys)
    {
        Console.WriteLine(tagName);
    }

    // likewise for the script bodies
    var scripts = parsed.ScriptBodies;
    Assert.IsNotNull(scripts);
    Assert.AreNotEqual(0, scripts.Count);
    foreach (var body in scripts)
    {
        Console.WriteLine(body);
    }
}
/// <summary>
/// Parses a web response into a single item carrying the XBRL uri and IRS id.
/// </summary>
/// <param name="content">
/// The web response; passed to <c>GetWebResponseBody</c> to obtain its text.
/// </param>
/// <returns>
/// Null when there is no response body, the parse has no entry for 'a' tags,
/// or no partial uri could be derived from them; otherwise a list holding one
/// object with <c>XrblUri</c> (<c>Edgar.SEC_ROOT_URL</c> plus the partial uri)
/// and <c>IrsId</c>.
/// </returns>
public override IEnumerable<dynamic> ParseContent(object content)
{
    var body = GetWebResponseBody(content);
    if (body == null)
    {
        return null;
    }

    var utf8Bytes = Encoding.UTF8.GetBytes(body);
    var parsed = AspNetParseTree.InvokeParse(new MemoryStream(utf8Bytes));

    // without any anchor tags there is nothing to derive the uri from
    var tags = parsed?.Tags2Attrs;
    if (tags == null || !tags.ContainsKey("a"))
    {
        return null;
    }

    var partialUri = GetXbrlXmlPartialUri(tags["a"]);
    if (string.IsNullOrWhiteSpace(partialUri))
    {
        return null;
    }

    // the return value (if any) is not inspected; IrsId is whatever was assigned to the out argument
    string irsId;
    TryGetGetIrsId(parsed, out irsId);

    var item = new { XrblUri = Edgar.SEC_ROOT_URL + partialUri, IrsId = irsId };
    return new List<dynamic> { item };
}