/// <summary>
/// Stores the supplied collaborators and builds the download URL by appending
/// <paramref name="ort"/> to <c>Url</c>.
/// </summary>
/// <param name="downloader">Assigned to the <c>downloader</c> member.</param>
/// <param name="htmlParser">Assigned to the <c>htmlParser</c> member.</param>
/// <param name="ort">Suffix appended to <c>Url</c> to form <c>downloadUrl</c>.</param>
/// <param name="entryAuditor">Assigned to the <c>entryAuditor</c> member.</param>
public TankstellenUsecaseController(
    Downloader downloader,
    HTMLParser htmlParser,
    string ort,
    EntryAuditor entryAuditor)
{
    this.downloader = downloader;
    this.htmlParser = htmlParser;
    downloadUrl = Url + ort;
    this.entryAuditor = entryAuditor;
}
/// <summary>
/// Reads a document from standard input and runs the parser's <c>document</c> rule over it.
/// Any exception is written to standard error and the process exits with code 1.
/// </summary>
/// <param name="args">Command-line arguments; not read by this method.</param>
public static void Main(string[] args)
{
    try
    {
        ByteBuffer stdinBytes = new ByteBuffer(Console.OpenStandardInput());
        HTMLLexer lexer = new HTMLLexer(stdinBytes);
        TokenBuffer tokens = new TokenBuffer(lexer);

        new HTMLParser(tokens).document();
    }
    catch (Exception failure)
    {
        Console.Error.WriteLine("exception: " + failure);
        Environment.Exit(1);
    }
}
/// <summary>
/// Tag handler for a link tag: marks that a link is open and captures its
/// "href", "id" and "feedurl" attributes.
/// </summary>
/// <param name="instance">Parser used to split <paramref name="tag"/> into attributes.</param>
/// <param name="tag">Raw tag text handed to <c>ParseAttributes</c>.</param>
private void OnLinkTag(HTMLParser instance, string tag)
{
    _inLink = true;

    HashMap attributes = instance.ParseAttributes(tag);

    // Missing attributes are stored as empty strings rather than null.
    _url = (string)attributes["href"] ?? string.Empty;
    _id = (string)attributes["id"] ?? string.Empty;

    // Only the presence of "feedurl" matters, not its value.
    _isFeed = attributes["feedurl"] != null;
}
/// <summary>
/// Checks that nested bold/italic markup is split into runs carrying the expected
/// combination of weight, style and decoration.
/// </summary>
public void TestHTMLParserMultiEmphasis()
{
    // Whole text wrapped in nested b+i, followed by plain text.
    var res = HTMLParser.Parse("<b><i>Hello</i></b> World").ToList();
    // Expected value goes first so a failure message reads "expected 2, actual N".
    Assert.AreEqual(2, res.Count);
    Assert.IsTrue(CheckRun((Run)res[0], "Hello", FontWeights.Bold, FontStyle.Italic, TextDecorations.None));
    Assert.IsTrue(CheckRun((Run)res[1], " World", FontWeights.Normal, FontStyle.Normal, TextDecorations.None));

    // Italic ends inside the bold span; a separate italic span follows.
    res = HTMLParser.Parse("<b><i>Hell</i>o</b> <i>World</i>").ToList();
    Assert.AreEqual(4, res.Count);
    Assert.IsTrue(CheckRun((Run)res[0], "Hell", FontWeights.Bold, FontStyle.Italic, TextDecorations.None));
    Assert.IsTrue(CheckRun((Run)res[1], "o", FontWeights.Bold, FontStyle.Normal, TextDecorations.None));
    Assert.IsTrue(CheckRun((Run)res[2], " ", FontWeights.Normal, FontStyle.Normal, TextDecorations.None));
    Assert.IsTrue(CheckRun((Run)res[3], "World", FontWeights.Normal, FontStyle.Italic, TextDecorations.None));
}
/// <summary>
/// Parses <c>lastAnswer</c> as HTML, gathers the results of
/// <c>LookForChildTag("div", true, class = "procedure__data")</c> from every non-proto
/// top-level tag and stores one <c>GPB</c> per hit in <c>ListResponse</c>.
/// </summary>
/// <returns>
/// <c>true</c> when at least one tag was found. <c>false</c> when the base implementation
/// returns false, or when nothing was found; in the latter case <c>lastError</c> is set
/// and <c>ListResponse</c> is an empty list.
/// </returns>
protected override bool FillListResponse()
{
    if (!base.FillListResponse())
    {
        return false;
    }

    List<Tag> matches = new List<Tag>();
    List<GPB> entries = new List<GPB>();

    foreach (Tag root in HTMLParser.Parse(lastAnswer))
    {
        if (root.IsProto)
        {
            continue;
        }

        matches.AddRange(root.LookForChildTag("div", true, new KeyValuePair<string, string>("class", "procedure__data")));
    }

    if (matches.Count < 1)
    {
        // Two distinct failures: the page carries the "emptyResultsBlock" marker,
        // or the expected markup is missing altogether (the raw answer is attached).
        lastError = lastAnswer.Contains("emptyResultsBlock")
            ? new Exception("Поиск не дал результатов")
            : new Exception("Ответ сервера не содержит данных (ожидались результаты с тегом \"div\" и классом \"procedure__data\"):" + Environment.NewLine + lastAnswer);
        this.ListResponse = entries;
        return false;
    }

    foreach (Tag match in matches)
    {
        entries.Add(new GPB(match, MyRequest));
    }

    this.ListResponse = entries;
    return true;
}
/// <summary>
/// Parses <paramref name="html"/> (run through Markdown first when <c>Format</c> is
/// <c>HtmlFormatType.Markdown</c>) and adds the parsed components to
/// <paramref name="container"/>: components met while no parent is open are inserted at
/// <paramref name="insertIndex"/> onwards, all others are appended to the currently open parent.
/// </summary>
/// <param name="source">Value assigned to <c>LoadedSource</c> of inserted top-level loadable components.</param>
/// <param name="html">Markup (or Markdown) text to parse.</param>
/// <param name="container">Receives the top-level components.</param>
/// <param name="insertIndex">Position in the container content for the first top-level component.</param>
protected virtual void ParseHtmlContents(string source, string html, IPDFContainerComponent container, int insertIndex)
{
    HTMLParserSettings settings = GetParserSettings();

    if (this.Format == HtmlFormatType.Markdown)
    {
        html = new Markdown().Transform(html);
    }

    HTMLParser parser = new HTMLParser(html, settings);
    IPDFComponentList topLevel = container.Content;

    // Components whose end marker has not been seen yet, innermost on top.
    Stack<IPDFComponent> openComponents = new Stack<IPDFComponent>();

    foreach (Scryber.Html.Parsing.HTMLParserResult result in parser)
    {
        if (!result.Valid || null == result.Parsed)
        {
            continue;
        }

        IPDFComponent parsed = result.Parsed;

        if (result.IsEnd)
        {
            openComponents.Pop();
            continue;
        }

        if (openComponents.Count != 0)
        {
            IPDFContainerComponent parent = (IPDFContainerComponent)openComponents.Peek();
            ((IPDFComponentList)parent.Content).Add(parsed);
        }
        else
        {
            _added.Add(parsed);
            topLevel.Insert(insertIndex, parsed);
            insertIndex++;

            IPDFLoadableComponent loadable = parsed as IPDFLoadableComponent;
            if (loadable != null)
            {
                loadable.LoadedSource = source;
            }
        }

        openComponents.Push(result.Parsed);
    }
}
/// <summary>
/// With word breaking off, feeds a document whose mixed-case META tag declares
/// charset "WinDowS-1251" and checks the returned fragments, the <c>Finished</c>
/// transitions and that <c>CharSet</c> reports "windows-1251".
/// </summary>
public void CharsetNoWordBreak()
{
    string markup = "<HTML><meTa httP-eQuIv=\"Content-Type\" content=\"text/html; cHaRseT=WinDowS-1251\"><BODY>1st frag</BODY></HTML>";
    byte[] encoded = Encoding.Default.GetBytes(markup);
    StreamReader reader = new StreamReader(new MemoryStream(encoded));

    using (HTMLParser parser = new HTMLParser(reader))
    {
        parser.BreakWords = false;

        Assert.AreEqual(false, parser.Finished);
        Assert.AreEqual("1st frag", parser.ReadNextFragment());

        // One more (empty) read is needed before the parser reports completion.
        Assert.AreEqual(false, parser.Finished);
        Assert.AreEqual("", parser.ReadNextFragment());
        Assert.AreEqual(true, parser.Finished);

        Assert.AreEqual("windows-1251", parser.CharSet, "Invalid charset!");
    }
}
/// <summary>
/// With word breaking off, checks that only the text inside BODY is returned as a
/// fragment when the BODY is nested inside HEAD and followed by further text.
/// </summary>
public void SimpleBodyNoWordBreak()
{
    string markup = "<HTML><HEAD><BODY>text in body</BODY>text to be ignored</HEAD></HTML>";
    byte[] encoded = Encoding.Default.GetBytes(markup);
    StreamReader reader = new StreamReader(new MemoryStream(encoded));

    using (HTMLParser parser = new HTMLParser(reader))
    {
        parser.BreakWords = false;

        Assert.AreEqual(false, parser.Finished);
        Assert.AreEqual("text in body", parser.ReadNextFragment());

        // The trailing text yields an empty fragment, after which the parser is finished.
        Assert.AreEqual(false, parser.Finished);
        Assert.AreEqual("", parser.ReadNextFragment());
        Assert.AreEqual(true, parser.Finished);
    }
}
/// <summary>
/// Console entry point: asks for a start URL, an output path and a crawl depth,
/// then runs <c>HTMLParser.ParseSite</c> and waits for a key press.
/// </summary>
/// <remarks>
/// The previous version read the URL and path from the console and then overwrote both
/// with hard-coded values, and read the depth without prompting for it. The hard-coded
/// values are kept as defaults that apply only when the user enters nothing.
/// </remarks>
/// <param name="args">Command-line arguments; not read by this method.</param>
static void Main(string[] args)
{
    const string DefaultUrl = "https://translate.google.com/";
    const string DefaultPath = "c:\\Temp\\Links";

    Console.WriteLine("Enter url");
    string url = Console.ReadLine();
    if (string.IsNullOrWhiteSpace(url))
    {
        url = DefaultUrl;
    }

    Console.WriteLine("Enter path");
    string path = Console.ReadLine();
    if (string.IsNullOrWhiteSpace(path))
    {
        path = DefaultPath;
    }

    Console.WriteLine("Enter depth");
    int depth;
    if (!int.TryParse(Console.ReadLine(), out depth))
    {
        // Report bad input instead of crashing with a FormatException.
        Console.Error.WriteLine("Depth must be an integer.");
        return;
    }

    HTMLParser loader = new HTMLParser(depth);
    loader.ParseSite(url, path);

    Console.ReadKey();
}
/// <summary>
/// Lexes and parses <paramref name="stream"/> with the HTML grammar, walks the resulting
/// tree with an <c>AspNetParseTree</c> listener and returns that listener's <c>Results</c>.
/// </summary>
/// <param name="stream">Stream handed to <c>AntlrInputStream</c>.</param>
/// <returns>The <c>Results</c> of the listener after the walk.</returns>
public static HtmlParseResults InvokeParse(Stream stream)
{
    var lexer = new HTMLLexer(new AntlrInputStream(stream));
    var parser = new HTMLParser(new CommonTokenStream(lexer));
    var documentTree = parser.htmlDocument();

    var walker = new ParseTreeWalker();
    var listener = new AspNetParseTree();
    walker.Walk(listener, documentTree);

    return listener.Results;
}
/// <summary>
/// With word breaking off, checks that a P tag carrying quoted attribute values splits
/// the body text into two fragments and that the attribute text is not returned.
/// </summary>
public void QuotesInTagNoWordBreak()
{
    string markup = "<HTML><HEAD><BODY>1st frag<P a=\"aaaa\" b=\"bbbb\"> 2nd frag </BODY></HEAD></HTML>";
    byte[] encoded = Encoding.Default.GetBytes(markup);
    StreamReader reader = new StreamReader(new MemoryStream(encoded));

    using (HTMLParser parser = new HTMLParser(reader))
    {
        parser.BreakWords = false;

        Assert.AreEqual(false, parser.Finished);
        Assert.AreEqual("1st frag", parser.ReadNextFragment());

        // Surrounding spaces of the second fragment are preserved.
        Assert.AreEqual(false, parser.Finished);
        Assert.AreEqual(" 2nd frag ", parser.ReadNextFragment());

        Assert.AreEqual(false, parser.Finished);
        Assert.AreEqual("", parser.ReadNextFragment());
        Assert.AreEqual(true, parser.Finished);
    }
}
/// <summary>
/// Checks that the parser reaches <c>Finished</c> within a bounded number of reads when
/// the input contains malformed and surplus closing tags.
/// </summary>
/// <remarks>
/// The earlier version looped up to 0x1000 (4096) reads but failed the test when the
/// counter reached decimal 1000, so a parser that finished after 1000..4095 reads was
/// wrongly reported as hung. The verdict now comes from <c>Finished</c> itself.
/// </remarks>
public void FinishingOnOverclosed()
{
    // Upper bound on reads; guards the test against an endless loop.
    const int MaxReads = 0x1000;

    string HTML = "<HTML><HEAD><Title>The title</</</</</</</a></a></html></head></title>";
    using (HTMLParser parser = new HTMLParser(
               new StreamReader(new MemoryStream(Encoding.Default.GetBytes(HTML)))))
    {
        for (int reads = 0; reads < MaxReads && !parser.Finished; reads++)
        {
            parser.ReadNextFragment();
        }

        if (!parser.Finished)
        {
            Assert.Fail("The parser has failed to finish.");
        }
    }
}
/// <summary>
/// Parses adverts from the URL described by <paramref name="model"/> and stores those
/// whose <c>Url</c> is not already present in <c>_context.Adverts</c>.
/// </summary>
/// <remarks>
/// Changes from the earlier version:
/// the existence check uses <c>Any</c> instead of <c>SingleOrDefault</c>, which threw when
/// the table already held two rows with the same URL;
/// new rows are saved with a single <c>SaveChangesAsync</c> call after the loop instead of
/// one call per advert, with URLs repeated inside the parsed batch filtered in memory;
/// the URL string is computed once.
/// </remarks>
/// <param name="model">Filter model; its <c>ToURLString()</c> gives the page to parse.</param>
/// <returns>An OK result carrying the parsed URL string.</returns>
public async Task<IActionResult> AddAvitoRentAdverts([FromBody] RentURLFilterModel model)
{
    string sourceUrl = model.ToURLString();
    var adverts = await HTMLParser.ParseAdvertsFromURL(sourceUrl);

    // URLs already queued in this request; pending rows are not visible to the
    // database query until they are saved.
    var queuedUrls = new System.Collections.Generic.HashSet<string>();
    bool anyAdded = false;

    foreach (AdvertModel advert in adverts)
    {
        if (!queuedUrls.Add(advert.Url))
        {
            continue;
        }

        if (_context.Adverts.Any(row => row.Url == advert.Url))
        {
            continue;
        }

        _context.Adverts.Add(advert);
        anyAdded = true;
    }

    if (anyAdded)
    {
        await _context.SaveChangesAsync();
    }

    return Ok(sourceUrl);
}
/// <summary>
/// With word breaking off, checks that the title and body text are returned as fragments
/// while the contents of the two SCRIPT elements are not.
/// </summary>
public void ScriptsNoWordBreak()
{
    string markup = "<HTML><HEAD><Title>The title</tITLe><script>i = 0</script></HEAD><BODY>1st frag<P a=\"aaaa\" b=\"bbbb\"><script>i = 0</script> 2nd frag </BODY></HTML>";
    byte[] encoded = Encoding.Default.GetBytes(markup);
    StreamReader reader = new StreamReader(new MemoryStream(encoded));

    using (HTMLParser parser = new HTMLParser(reader))
    {
        parser.BreakWords = false;

        Assert.AreEqual(false, parser.Finished);
        Assert.AreEqual("The title", parser.ReadNextFragment());

        Assert.AreEqual(false, parser.Finished);
        Assert.AreEqual("1st frag", parser.ReadNextFragment());

        Assert.AreEqual(false, parser.Finished);
        Assert.AreEqual(" 2nd frag ", parser.ReadNextFragment());

        // Final empty fragment, then completion.
        Assert.AreEqual(false, parser.Finished);
        Assert.AreEqual("", parser.ReadNextFragment());
        Assert.AreEqual(true, parser.Finished);
    }
}
/// <summary>
/// With word breaking off, checks that character entity references in the input
/// (&amp;lt; &amp;gt; &amp;quot; &amp;laquo; &amp;mdash; &amp;raquo;) come back decoded in the fragments.
/// </summary>
/// <remarks>
/// The input literal previously contained the already-decoded characters, including bare
/// double quotes that ended the string literal early, so the method did not compile and
/// exercised no entity references. The entities are restored here.
/// NOTE(review): the exact entity names are reconstructed from the expected fragments;
/// confirm against the test's history.
/// </remarks>
public void CharEntityReferencesNoWordBreak()
{
    string HTML = "<body><p>include &lt;list&gt;<p>include &quot;omniamea.h&quot;<p>#include &laquo;Kama&mdash;Sutra&raquo;</p></body>";
    using (HTMLParser parser = new HTMLParser(
               new StreamReader(new MemoryStream(Encoding.Default.GetBytes(HTML)))))
    {
        parser.BreakWords = false;

        Assert.AreEqual(false, parser.Finished);
        Assert.AreEqual("include <list>", parser.ReadNextFragment());

        Assert.AreEqual(false, parser.Finished);
        Assert.AreEqual("include \"omniamea.h\"", parser.ReadNextFragment());

        Assert.AreEqual(false, parser.Finished);
        Assert.AreEqual("#include «Kama—Sutra»", parser.ReadNextFragment());

        Assert.AreEqual(false, parser.Finished);
        Assert.AreEqual("", parser.ReadNextFragment());
        Assert.AreEqual(true, parser.Finished);
    }
}
/// <summary>
/// With word breaking off, checks that the parser reaches <c>Finished</c> within a bounded
/// number of reads when the input ends inside an unclosed TITLE element.
/// </summary>
/// <remarks>
/// The earlier version looped up to 0x1000 (4096) reads but failed the test when the
/// counter reached decimal 1000, so a parser that finished after 1000..4095 reads was
/// wrongly reported as hung. The verdict now comes from <c>Finished</c> itself.
/// </remarks>
public void FinishingOnUnclosedNoWordBreak()
{
    // Upper bound on reads; guards the test against an endless loop.
    const int MaxReads = 0x1000;

    string HTML = "<HTML><HEAD><Title>The title";
    using (HTMLParser parser = new HTMLParser(
               new StreamReader(new MemoryStream(Encoding.Default.GetBytes(HTML)))))
    {
        parser.BreakWords = false;

        for (int reads = 0; reads < MaxReads && !parser.Finished; reads++)
        {
            parser.ReadNextFragment();
        }

        if (!parser.Finished)
        {
            Assert.Fail("The parser has failed to finish.");
        }
    }
}
/// <summary>
/// Checks that a single b, i, u or s element yields exactly one run with the matching
/// weight, style or decoration, and that leading/trailing spaces in the text are kept.
/// </summary>
public void TestHTMLParserSimpleEmphasis()
{
    // Expected value goes first in Assert.AreEqual so failure messages read correctly.
    var res = HTMLParser.Parse("<b>Hello</b>").ToList();
    Assert.AreEqual(1, res.Count);
    Assert.IsTrue(CheckRun(res.First(), "Hello", FontWeights.Bold, FontStyle.Normal, TextDecorations.None));

    res = HTMLParser.Parse("<i> Hello</i>").ToList();
    Assert.AreEqual(1, res.Count);
    Assert.IsTrue(CheckRun(res.First(), " Hello", FontWeights.Normal, FontStyle.Italic, TextDecorations.None));

    res = HTMLParser.Parse("<u>Hello </u>").ToList();
    Assert.AreEqual(1, res.Count);
    Assert.IsTrue(CheckRun(res.First(), "Hello ", FontWeights.Normal, FontStyle.Normal, TextDecorations.Underline));

    res = HTMLParser.Parse("<s> Hello World </s>").ToList();
    Assert.AreEqual(1, res.Count);
    Assert.IsTrue(CheckRun(res.First(), " Hello World ", FontWeights.Normal, FontStyle.Normal, TextDecorations.Strikethrough));
}
/// <summary>
/// Parses <c>lastAnswer</c> as HTML, gathers the results of
/// <c>LookForChildTag("span", true, class = "teaser teaser-product")</c> from every
/// non-proto top-level tag, stores one <c>LotOnlineSales</c> per hit in
/// <c>ListResponse</c> and sets the three-column <c>tableHead</c>.
/// </summary>
/// <returns>
/// <c>false</c> when the base implementation returns false; otherwise <c>true</c>,
/// even when no tag was found.
/// </returns>
protected override bool FillListResponse()
{
    if (!base.FillListResponse())
    {
        return false;
    }

    var productTags = new List<Tag>();
    foreach (Tag root in HTMLParser.Parse(lastAnswer))
    {
        if (root.IsProto)
        {
            continue;
        }

        productTags.AddRange(root.LookForChildTag("span", true, new KeyValuePair<string, string>("class", "teaser teaser-product")));
    }

    var lots = new List<LotOnlineSales>();
    foreach (Tag productTag in productTags)
    {
        lots.Add(new LotOnlineSales(productTag, MyRequest));
    }

    this.ListResponse = lots;
    tableHead = new string[] { "№", "Наименование", "Цена" };
    return true;
}
/// <summary>
/// Builds the outline from HTML: takes the children of the first node with
/// id "work_outline", keeps the "tr" nodes and adds one <c>data</c> entry per row,
/// keyed by the trimmed text of the row's first child with the divided text of its
/// last child as the value.
/// </summary>
/// <param name="docs">HTML text handed to <c>HTMLParser.GetByHTML</c>.</param>
public RJOutline(String docs)
{
    HTMLParser parser = HTMLParser.GetByHTML(docs);

    NodeList rows = parser.GetFirstNode("id", "work_outline").Children;
    rows.KeepAllNodesThatMatch(new TagNameFilter("tr"));

    for (int index = 0; index < rows.Count; index++)
    {
        INode row = rows.ElementAt(index);
        if (row == null)
        {
            continue;
        }

        // Clean the row's children before reading its first and last child.
        row.Children.RemoveMeaninglessNodes();

        var key = row.FirstChild.ToPlainTextStringEx().Trim();
        var value = row.LastChild.ToDividedTextString(" ").TrimAll();
        this.data.Add(key, value);
    }
}
/// <summary>
/// Checks that text mixing emphasised and plain segments is split into one run per
/// segment, each with the expected weight, style and decoration.
/// </summary>
public void TestHTMLParserMultiNode()
{
    // Expected value goes first in Assert.AreEqual so failure messages read correctly.

    // Emphasised segment followed by plain text.
    var res = HTMLParser.Parse("<b>Hello</b> World").ToList();
    Assert.AreEqual(2, res.Count);
    Assert.IsTrue(CheckRun((Run)res[0], "Hello", FontWeights.Bold, FontStyle.Normal, TextDecorations.None));
    Assert.IsTrue(CheckRun((Run)res[1], " World", FontWeights.Normal, FontStyle.Normal, TextDecorations.None));

    // Plain text between two differently emphasised segments.
    res = HTMLParser.Parse("<b>Hello</b> World <i>!!!</i>").ToList();
    Assert.AreEqual(3, res.Count);
    Assert.IsTrue(CheckRun((Run)res[0], "Hello", FontWeights.Bold, FontStyle.Normal, TextDecorations.None));
    Assert.IsTrue(CheckRun((Run)res[1], " World ", FontWeights.Normal, FontStyle.Normal, TextDecorations.None));
    Assert.IsTrue(CheckRun((Run)res[2], "!!!", FontWeights.Normal, FontStyle.Italic, TextDecorations.None));

    // Emphasised segment surrounded by plain text.
    res = HTMLParser.Parse("Welcome <b>Hello</b> World").ToList();
    Assert.AreEqual(3, res.Count);
    Assert.IsTrue(CheckRun((Run)res[0], "Welcome ", FontWeights.Normal, FontStyle.Normal, TextDecorations.None));
    Assert.IsTrue(CheckRun((Run)res[1], "Hello", FontWeights.Bold, FontStyle.Normal, TextDecorations.None));
    Assert.IsTrue(CheckRun((Run)res[2], " World", FontWeights.Normal, FontStyle.Normal, TextDecorations.None));
}
/// <summary>
/// Applies the edited attribute string to the selected element and rebuilds
/// <c>Document</c> as "&lt;!DOCTYPE html&gt;" followed by the text of every element in
/// <c>Elements</c>.
/// </summary>
/// <remarks>
/// Changes from the earlier version: a null attribute string is treated like an empty one
/// instead of being passed to <c>HTMLParser.GetElementAttributes</c>, and the element
/// texts are joined in one pass instead of re-copying the growing string per element.
/// </remarks>
public void BuildDocument()
{
    if (string.IsNullOrEmpty(this.SelectedElementAttributesString))
    {
        this.SelectedElement.Attributes = new List<string>();
    }
    else
    {
        List<string> properties = HTMLParser.GetElementAttributes(this.SelectedElementAttributesString);
        this.SelectedElement.Attributes = new List<string>(properties);
    }

    // Concat<T> appends each element's ToString() in order, skipping nulls, which is
    // what the interpolated "{html}{element}" accumulation produced.
    string html = string.Concat<ElementModel>(this.Elements);
    this.Document = $"<!DOCTYPE html>{html}";
}
/// <summary>
/// Fetches the page at <paramref name="web_url"/>, extracts and decrypts the media URL
/// for <paramref name="pid"/> and starts playback, updating the play button, progress
/// ring and slider along the way.
/// </summary>
/// <remarks>
/// Fix: the <c>MediaOpened</c> handler used to be added on every call, so each further
/// call made the handler run one more time per opened track. It is now detached before
/// being attached, which keeps exactly one subscription.
/// NOTE(review): the method stays <c>async void</c> to keep its signature, so an
/// exception thrown by the request or the parser cannot be observed by callers.
/// </remarks>
/// <param name="web_url">Page to request.</param>
/// <param name="pid">Identifier passed to <c>HTMLParser.GetEncryptedURL</c>.</param>
public async void PlayAsync(string web_url, string pid)
{
    // Show the busy state while the page is fetched.
    btnPlay.Content = "Pause";
    btnPlay.Visibility = Visibility.Hidden;
    PlayProgressRing.Visibility = Visibility.Visible;
    PlayerSlider.Visibility = Visibility.Visible;

    SaavnPageRequest pageRequest = new SaavnPageRequest();
    System.Diagnostics.Debug.Write("Fetching HTML : " + web_url);
    string html = await pageRequest.MakeRequest(web_url);

    string enc_media_url = HTMLParser.GetEncryptedURL(html, pid);
    string mediaUrl = Decrypto.GetDESDecryptedUrl(enc_media_url);
    if (mediaUrl == null)
    {
        btnPlay.Visibility = Visibility.Visible;
        PlayProgressRing.Visibility = Visibility.Hidden;
        btnPlay.Content = "Unavailable";
        btnPlay.IsEnabled = false;
        return;
    }

    if (songPlayer == null)
    {
        songPlayer = new MediaPlayer();
        songPlayer.MediaEnded += songPlayer_MediaEnded;
    }

    // Removing a handler that is not attached is a no-op, so this pair leaves
    // exactly one subscription however many times PlayAsync runs.
    songPlayer.MediaOpened -= songPlayer_MediaOpened;
    songPlayer.MediaOpened += songPlayer_MediaOpened;

    Uri uri = new Uri(mediaUrl);
    songPlayer.Open(uri);
    songPlayer.Play();
    songPlayer.Volume = 1;

    btnPlay.Visibility = Visibility.Visible;
    PlayProgressRing.Visibility = Visibility.Hidden;
}
/// <summary>
/// Manual smoke run: activates a Sterling engine, registers and purges the weapon
/// database under "weaponDB/", processes the Wikipedia battle-rifle list article and
/// saves every resulting <c>Weapon</c>, then issues a query over the saved weapons.
/// </summary>
public static void test()
{
    SterlingEngine engine = new SterlingEngine();
    engine.Activate();

    var database = engine.SterlingDatabase.RegisterDatabase<WeaponDBInstance>(
        new FileSystemDriver("weaponDB/"));
    database.Purge();

    HTMLParser articleParser = new HTMLParser(@"/wiki/List_of_battle_rifles", @"https://en.wikipedia.org");
    var weapons = articleParser.ProcessArticle().ToList();
    foreach (var weapon in weapons)
    {
        database.Save<Weapon>(weapon);
    }

    // The query result is not used further.
    var data = database.Query<Weapon, string>();
}
/// <summary>
/// Lets the user pick an .html file, parses it into <c>Elements</c>, selects the first
/// element, rebuilds the document and records the file path. Does nothing when the
/// dialog is not confirmed.
/// </summary>
public void Open()
{
    OpenFileDialog dialog = new OpenFileDialog();
    dialog.Filter = "HTML Files|*.html";

    if (dialog.ShowDialog() != true)
    {
        return;
    }

    string document;
    using (StreamReader reader = new StreamReader(dialog.FileName))
    {
        document = reader.ReadToEnd();
    }

    this.Elements = new ObservableCollection<ElementModel>(HTMLParser.Parse(document));
    this.SelectedElement = this.Elements[0];
    this.BuildDocument();

    this.FilePath = dialog.FileName;
    this.OpenInBrowserAvailable = true;
}
/// <summary>
/// Parses a small fixed HTML document and runs an <c>HTMLVisitor</c> over the resulting
/// tree. The input is echoed to the console; any exception is caught and printed there
/// as well rather than propagated.
/// </summary>
public void VisitorTest()
{
    try
    {
        string input = "<html><title>blahg blah blah</title><p>para graph</p><p>second para</p></html>";
        Console.WriteLine(input);

        var lexer = new HTMLLexer(new AntlrInputStream(input));
        var parser = new HTMLParser(new CommonTokenStream(lexer));
        var documentContext = parser.htmlDocument();

        new HTMLVisitor().Visit(documentContext);
    }
    catch (Exception ex)
    {
        Console.WriteLine("Error: " + ex);
    }
}
/// <summary>
/// Reads fragments from the parser until it reports <c>Finished</c> and returns them
/// concatenated in order, with no separator added between fragments.
/// Then verifies that one further read throws <see cref="EndOfStreamException"/>.
/// </summary>
/// <param name="parser">Parser to drain.</param>
/// <returns>All fragments joined into a single string.</returns>
/// <exception cref="InvalidOperationException">
/// The read past the end returned a value instead of throwing.
/// </exception>
private string ReadAllFragments(HTMLParser parser)
{
    StringBuilder collected = new StringBuilder();
    while (!parser.Finished)
    {
        string fragment = parser.ReadNextFragment();
        collected.Append(fragment);
    }

    try
    {
        // Any normal return here is a failure; only the message differs.
        string beyondEnd = parser.ReadNextFragment();
        throw new InvalidOperationException(
            beyondEnd.Length != 0
                ? "Parser must return an empty fragment having read the whole text (if there's a tag after the last returned meaningful string)."
                : "Parser must throw an exception if reading beyond end of stream.");
    }
    catch (EndOfStreamException)
    {
        // Expected: reading past the end must throw.
    }

    return collected.ToString();
}
/// <summary>
/// Parses <c>lastAnswer</c> as HTML, gathers the results of
/// <c>LookForChildTag("li", true, class = "component-list__item lot-catalog__list-item")</c>
/// from every non-proto top-level tag and stores one <c>TorgASV</c> per hit in
/// <c>ListResponse</c>.
/// </summary>
/// <returns>
/// <c>false</c> when the base implementation returns false; otherwise <c>true</c>,
/// even when no tag was found.
/// </returns>
protected override bool FillListResponse()
{
    if (!base.FillListResponse())
    {
        return false;
    }

    var lotTags = new List<Tag>();
    foreach (Tag root in HTMLParser.Parse(lastAnswer))
    {
        if (root.IsProto)
        {
            continue;
        }

        lotTags.AddRange(root.LookForChildTag("li", true, new KeyValuePair<string, string>("class", "component-list__item lot-catalog__list-item")));
    }

    var lots = new List<TorgASV>();
    foreach (Tag lotTag in lotTags)
    {
        lots.Add(new TorgASV(lotTag, MyRequest));
    }

    this.ListResponse = lots;
    return true;
}
/// <summary>
/// Restarts bookmark reading: looks for "bookmarks.html" under <c>_path</c> and, when
/// found, opens a new parser on the first match with handlers for the dl, h3, a and dd
/// tags and clears the nesting state. When no file is found the parser is set to null
/// and the nesting state is left untouched.
/// </summary>
/// <remarks>
/// Fix: the previous parser, which wraps an open <c>StreamReader</c> on the bookmark
/// file, used to be replaced without being disposed.
/// NOTE(review): assumes this <c>HTMLParser</c> is the disposable type that other code in
/// this file wraps in <c>using</c>; confirm before merging.
/// </remarks>
public void Reset()
{
    if (_parser != null)
    {
        _parser.Dispose();
        _parser = null;
    }

    FileInfo[] bookmarkFiles = IOTools.GetFiles(_path, "bookmarks.html");
    if (bookmarkFiles == null || bookmarkFiles.Length == 0)
    {
        return;
    }

    _parser = new HTMLParser(new StreamReader(bookmarkFiles[0].FullName), true);
    _parser.BreakWords = false;

    _parser.AddTagHandler("dl", new HTMLParser.TagHandler(OnDLTag));
    _parser.AddTagHandler("/dl", new HTMLParser.TagHandler(OnDLClosedTag));
    _parser.AddTagHandler("h3", new HTMLParser.TagHandler(OnHeaderTag));
    _parser.AddTagHandler("/h3", new HTMLParser.TagHandler(OnHeaderClosedTag));
    _parser.AddTagHandler("a", new HTMLParser.TagHandler(OnLinkTag));
    _parser.AddTagHandler("/a", new HTMLParser.TagHandler(OnLinkClosedTag));
    _parser.AddTagHandler("dd", new HTMLParser.TagHandler(OnDescriptionTag));

    _level = 0;
    _inHeader = _inLink = _inDescription = false;
}
/// <summary>
/// Timer callback. When the ICE downloader switch is on, extracts the tables from the two
/// downloaded ICE files, merges them into the "ICE Raw Collated" queue and writes it to
/// CSV. Then stops the timer and, when the recon switch is on, runs every comparison row
/// against the queues, writes all queues to CSV and records the completion time in the
/// status label.
/// </summary>
/// <remarks>
/// Fix: the status timestamp used "dd-mmm-yy hh:mm:ss". In .NET custom formats lowercase
/// "m" is minutes, so the month position printed minutes, and "hh" is a 12-hour clock
/// with no AM/PM marker. The format is now "dd-MMM-yy HH:mm:ss".
/// </remarks>
/// <param name="sender">Event source; not read.</param>
/// <param name="e">Event data; not read.</param>
private void reconSchedulerMethod(object sender, EventArgs e)
{
    if (ICEDownloaderSwitch.Value == 1)
    {
        var tables = new HTMLParser(@"C:\Users\Akhand\Documents\Visual Studio 2015\Projects\ExchangeRecon\ExchangeRecon\AppFiles\Download Data\ICE_CBNA.xls").Process().ToList();
        var cgmlTables = new HTMLParser(@"C:\Users\Akhand\Documents\Visual Studio 2015\Projects\ExchangeRecon\ExchangeRecon\AppFiles\Download Data\ICE_CGML.xls").Process().ToList();
        tables.AddRange(cgmlTables);
        Console.WriteLine("Num of tables extracted : " + tables.Count);

        var data = new DataTable("ICE Data");
        MergeTables(tables, data);
        AddPKtoICE(data);

        ICEqueues["ICE Raw Collated"].QData = data;
        ICEqueues["ICE Raw Collated"].ToCSV();
    }

    Console.WriteLine("Chal gaya hu");
    ICEReconTimer.Stop();

    if (ReconSwitch.Value == 1)
    {
        ICEReconStatus.Content = "Running";

        // Run every configured comparison against the queues.
        ReconQueue comparisons = ICEqueues["Comparisons"];
        foreach (DataRow comp in comparisons.QData.Rows)
        {
            queryTrials(comp, ICEqueues);
        }
        Console.WriteLine("Analysis done");

        foreach (KeyValuePair<string, ReconQueue> q in ICEqueues)
        {
            q.Value.ToCSV();
        }

        DateTime now = TimeZoneInfo.ConvertTimeFromUtc(System.DateTime.UtcNow, TimeZoneInfo.Local);
        // MMM = abbreviated month name, HH = 24-hour clock, mm = minutes.
        ICEReconStatus.Content = "Last executed at " + now.ToString("dd-MMM-yy HH:mm:ss");
    }
}
/// <summary>
/// Downloads the page at <c>source.Url</c> and parses it into products, using the HTML
/// parser when <c>source.ParserId</c> is 0 and the JSON parser otherwise.
/// </summary>
/// <param name="source">Supplies the URL, the parser id and the market for the parser input.</param>
/// <returns>The products returned by the selected parser.</returns>
private static async Task<IEnumerable<Product>> ParseAsync(ParserSource source)
{
    // Fetch the raw response body.
    HttpDownloader downloader = new HttpDownloader(source.Url, null, null);
    string body = await downloader.GetPageAsync();

    // Pick the parser implementation by id.
    IClassParser<ParserInput, Product> parser;
    if (source.ParserId != 0)
    {
        parser = new JsonParser<ParserInput, Product>(body);
    }
    else
    {
        parser = new HTMLParser<ParserInput, Product>(body);
    }

    return parser.Parse(new ParserInput(source, source.Market));
}
/// <summary>
/// Parses <paramref name="stream"/> with the HTML grammar (error listeners removed from
/// both lexer and parser) and walks the tree with an <c>HTMLKeywordParserListener</c>
/// configured from this instance's length and inclusion settings.
/// </summary>
/// <param name="stream">Input handed to <c>AntlrInputStream</c>.</param>
/// <param name="callback">Invoked through the listener's <c>Emit</c> delegate.</param>
public void Parse(Stream stream, Action<string> callback)
{
    var lexer = new HTMLLexer(new AntlrInputStream(stream));
    lexer.RemoveErrorListeners();

    var parser = new HTMLParser(new CommonTokenStream(lexer));
    parser.RemoveErrorListeners();

    var documentContext = parser.htmlDocument();

    var listener = new HTMLKeywordParserListener();
    listener.Emit = (x) => callback(x);
    listener.MinimumLength = MinimumLength;
    listener.MaximumLength = MaximumLength;
    listener.IncludeChardata = IncludeChardata;
    listener.IncludeComments = IncludeComments;

    new ParseTreeWalker().Walk(listener, documentContext);
}
/// <summary>
/// Parses <c>lastAnswer</c> as HTML, gathers the results of
/// <c>LookForChildTag("div", true, class = "row no-gutters registry-entry__form mr-0")</c>
/// from every non-proto top-level tag and stores one <c>ZakupkiGov</c> per hit in
/// <c>ListResponse</c>.
/// </summary>
/// <returns>
/// <c>false</c> when the base implementation returns false; otherwise <c>true</c>,
/// even when no tag was found.
/// </returns>
protected override bool FillListResponse()
{
    if (!base.FillListResponse())
    {
        return false;
    }

    var entryTags = new List<Tag>();
    foreach (Tag root in HTMLParser.Parse(lastAnswer))
    {
        if (root.IsProto)
        {
            continue;
        }

        entryTags.AddRange(root.LookForChildTag("div", true, new KeyValuePair<string, string>("class", "row no-gutters registry-entry__form mr-0")));
    }

    var entries = new List<ZakupkiGov>();
    foreach (Tag entryTag in entryTags)
    {
        entries.Add(new ZakupkiGov(entryTag, MyRequest));
    }

    this.ListResponse = entries;
    return true;
}
/// <summary>
/// Stores "name=value" for an attribute that has a value. Values longer than two
/// characters are treated as first character + inner text + last character, and only the
/// inner text is XML-escaped; shorter values are stored unchanged.
/// Nothing is stored when the name or value node, or its stored text, is missing or empty.
/// </summary>
/// <param name="ctx">Parse-tree node of the assigned attribute.</param>
public override void ExitAssignedAttr(HTMLParser.AssignedAttrContext ctx)
{
    var nameCtx = ctx.htmlAttributeName();
    if (nameCtx == null)
    {
        return;
    }

    var valueCtx = ctx.htmlAttributeValue();
    if (valueCtx == null)
    {
        return;
    }

    var name = MyTreeProperty.Get(nameCtx);
    if (string.IsNullOrEmpty(name))
    {
        return;
    }

    var value = MyTreeProperty.Get(valueCtx);
    if (string.IsNullOrEmpty(value))
    {
        return;
    }

    //TODO swap attributes name & values here
    if (value.Length > 2)
    {
        // NOTE(review): the first and last characters are assumed to be the quote
        // marks; they are copied through without being checked or escaped.
        var opening = value.Substring(0, 1);
        var closing = value.Substring(value.Length - 1, 1);
        var inner = value.Substring(1, value.Length - 2).EscapeString(EscapeStringType.XML);
        FilteredPut(ctx, name + "=" + opening + inner + closing);
    }
    else
    {
        FilteredPut(ctx, name + "=" + value);
    }
}
/// <summary>
/// Stores the text of a content node: first the stored text of every child element,
/// then the stored text of every character-data child. Element text is therefore always
/// placed before character data, whatever their order in the source.
/// </summary>
/// <param name="ctx">Parse-tree node of the content.</param>
public override void ExitHtmlContent(HTMLParser.HtmlContentContext ctx)
{
    var combined = new StringBuilder();

    foreach (var elementCtx in ctx.htmlElement())
    {
        var elementText = MyTreeProperty.Get(elementCtx);
        if (!string.IsNullOrEmpty(elementText))
        {
            combined.Append(elementText);
        }
    }

    foreach (var charDataCtx in ctx.htmlChardata())
    {
        var charDataText = MyTreeProperty.Get(charDataCtx);
        // Whitespace-only character data is dropped; elements are only dropped when empty.
        if (!string.IsNullOrWhiteSpace(charDataText))
        {
            combined.Append(charDataText);
        }
    }

    FilteredPut(ctx, combined.ToString());
}
/// <summary>
/// Records the body of a style element in <c>_results.StyleBodies</c>, with a trailing
/// "&lt;/&gt;" and/or "&lt;/style&gt;" terminator removed and surrounding whitespace trimmed.
/// Blank bodies are ignored.
/// </summary>
/// <remarks>
/// Fix: the terminator checks now use ordinal comparison. <c>EndsWith(string)</c> without
/// a comparison type is culture-sensitive, which is wrong for fixed markup tokens.
/// </remarks>
/// <param name="ctx">Parse-tree node of the style element.</param>
public override void ExitStyle(HTMLParser.StyleContext ctx)
{
    const string ShortTerminator = "</>";
    const string Terminator = "</style>";

    var styleBody = ctx.STYLE_BODY() ?? ctx.STYLE_SHORT_BODY();
    var styleBodyText = styleBody?.GetText();
    if (string.IsNullOrWhiteSpace(styleBodyText))
    {
        return;
    }

    if (styleBodyText.EndsWith(ShortTerminator, System.StringComparison.Ordinal))
    {
        styleBodyText = styleBodyText.Substring(0, styleBodyText.Length - ShortTerminator.Length);
    }

    if (styleBodyText.EndsWith(Terminator, System.StringComparison.Ordinal))
    {
        styleBodyText = styleBodyText.Substring(0, styleBodyText.Length - Terminator.Length);
    }

    _results.StyleBodies.Add(styleBodyText.Trim());
}
/// <summary>
/// Handles an attribute without a value: records its name in <c>_results.EmptyAttrs</c>
/// (once per distinct name) and stores "name='true'" for the node.
/// Nothing happens when the name node or its stored text is missing or empty.
/// </summary>
/// <param name="ctx">Parse-tree node of the empty attribute.</param>
public override void ExitEmptyAttr(HTMLParser.EmptyAttrContext ctx)
{
    var nameCtx = ctx.htmlAttributeName();
    if (nameCtx == null)
    {
        return;
    }

    var name = MyTreeProperty.Get(nameCtx);
    if (string.IsNullOrEmpty(name))
    {
        return;
    }

    //TODO swap attributes name with name-value pair here
    bool alreadyRecorded = _results.EmptyAttrs.Any(known => known == name);
    if (!alreadyRecorded)
    {
        _results.EmptyAttrs.Add(name);
    }

    FilteredPut(ctx, name + "='true'");
}
/// <summary>
/// Stores the text of the ATTVALUE_VALUE token for an attribute-value node;
/// does nothing when the token is absent.
/// </summary>
/// <param name="ctx">Parse-tree node of the attribute value.</param>
public override void ExitHtmlAttributeValue(HTMLParser.HtmlAttributeValueContext ctx)
{
    var valueToken = ctx.ATTVALUE_VALUE();
    if (valueToken != null)
    {
        FilteredPut(ctx, valueToken.GetText());
    }
}
/// <summary>
/// Stores the text of the TAG_NAME token for an attribute-name node;
/// does nothing when the token is absent.
/// </summary>
/// <param name="ctx">Parse-tree node of the attribute name.</param>
public override void ExitHtmlAttributeName(HTMLParser.HtmlAttributeNameContext ctx)
{
    var nameToken = ctx.TAG_NAME();
    if (nameToken != null)
    {
        FilteredPut(ctx, nameToken.GetText());
    }
}
/// <summary>
/// Records the text of a scriptlet token in <c>_results.ScriptLets</c>;
/// missing or blank text is ignored.
/// </summary>
/// <param name="ctx">Parse-tree node of the scriptlet.</param>
public override void ExitScriptlet(HTMLParser.ScriptletContext ctx)
{
    var text = ctx.SCRIPTLET()?.GetText();
    if (!string.IsNullOrWhiteSpace(text))
    {
        _results.ScriptLets.Add(text);
    }
}
/// <summary>
/// Trims and XML-escapes the text of a character-data node, records it in
/// <c>_results.CharData</c> and stores it for the node. Blank text is ignored.
/// </summary>
/// <param name="context">Parse-tree node of the character data.</param>
public override void ExitHtmlChardata(HTMLParser.HtmlChardataContext context)
{
    var rawText = context.GetText();
    if (string.IsNullOrWhiteSpace(rawText))
    {
        return;
    }

    string escaped = rawText.Trim().EscapeString(EscapeStringType.XML);
    _results.CharData.Add(escaped);
    FilteredPut(context, escaped);
}
/// <summary>
/// Records the text of an HTML comment token in <c>_results.HtmlComments</c>;
/// missing or blank text is ignored.
/// </summary>
/// <param name="context">Parse-tree node of the comment.</param>
public override void ExitHtmlComment(HTMLParser.HtmlCommentContext context)
{
    var text = context.HTML_COMMENT()?.GetText();
    if (!string.IsNullOrWhiteSpace(text))
    {
        _results.HtmlComments.Add(text);
    }
}
/// <summary>
/// Records the body of a script element in <c>_results.ScriptBodies</c>, with a trailing
/// "&lt;/&gt;" and/or "&lt;/script&gt;" terminator removed and surrounding whitespace trimmed.
/// Blank bodies are ignored.
/// </summary>
/// <remarks>
/// Fix: the terminator checks now use ordinal comparison. <c>EndsWith(string)</c> without
/// a comparison type is culture-sensitive, which is wrong for fixed markup tokens.
/// </remarks>
/// <param name="ctx">Parse-tree node of the script element.</param>
public override void ExitScript(HTMLParser.ScriptContext ctx)
{
    const string ShortTerminator = "</>";
    const string Terminator = "</script>";

    var scriptBodyNode = ctx.SCRIPT_BODY() ?? ctx.SCRIPT_SHORT_BODY();
    var scriptBodyText = scriptBodyNode?.GetText();
    if (string.IsNullOrWhiteSpace(scriptBodyText))
    {
        return;
    }

    if (scriptBodyText.EndsWith(ShortTerminator, System.StringComparison.Ordinal))
    {
        scriptBodyText = scriptBodyText.Substring(0, scriptBodyText.Length - ShortTerminator.Length);
    }

    if (scriptBodyText.EndsWith(Terminator, System.StringComparison.Ordinal))
    {
        scriptBodyText = scriptBodyText.Substring(0, scriptBodyText.Length - Terminator.Length);
    }

    _results.ScriptBodies.Add(scriptBodyText.Trim());
}
/// <summary>
/// Rebuilds the markup of an element that has an opening and a closing tag: opening tag
/// with its stored attribute texts, the stored content text, the closing tag and a
/// trailing newline. Also registers the tag name and each distinct attribute text in
/// <c>_results.Tags2Attrs</c>. Nothing happens when the tag name is missing or empty.
/// </summary>
/// <param name="ctx">Parse-tree node of the paired element.</param>
public override void ExitPairElement(HTMLParser.PairElementContext ctx)
{
    var tagName = ctx.htmlTagName(0)?.TAG_NAME()?.GetText();
    if (string.IsNullOrEmpty(tagName))
    {
        return;
    }

    // Make sure the tag has an attribute list, then work on that list directly.
    if (!_results.Tags2Attrs.ContainsKey(tagName))
    {
        _results.Tags2Attrs.Add(tagName, new List<string>());
    }
    var knownAttrs = _results.Tags2Attrs[tagName];

    var openingTagParts = new List<string> { tagName };
    foreach (var attrCtx in ctx.htmlAttribute())
    {
        var attrText = MyTreeProperty.Get(attrCtx);
        if (string.IsNullOrEmpty(attrText))
        {
            continue;
        }

        // Register the attribute text once per tag, compared as-is.
        if (!knownAttrs.Any(known => known == attrText))
        {
            knownAttrs.Add(attrText);
        }

        openingTagParts.Add(attrText);
    }

    var markup = new StringBuilder();
    markup.Append("<").Append(string.Join(" ", openingTagParts)).Append(">");

    var contentCtx = ctx.htmlContent();
    if (contentCtx != null)
    {
        var contentText = MyTreeProperty.Get(contentCtx);
        if (!string.IsNullOrEmpty(contentText))
        {
            markup.Append(contentText);
        }
    }

    markup.Append("</").Append(tagName).Append(">").Append("\n");

    FilteredPut(ctx, markup.ToString());
}
/// <summary>
/// Concatenates the stored markup of every top-level elements node, in order, and
/// assigns the result to <c>_results.HtmlOnly</c>. Nodes with no stored markup are skipped.
/// </summary>
/// <param name="ctx">Parse-tree node of the whole document.</param>
public override void ExitHtmlDocument(HTMLParser.HtmlDocumentContext ctx)
{
    var documentMarkup = new StringBuilder();

    foreach (var elementsCtx in ctx.htmlElements())
    {
        var markup = MyTreeProperty.Get(elementsCtx);
        if (!string.IsNullOrEmpty(markup))
        {
            documentMarkup.Append(markup);
        }
    }

    _results.HtmlOnly = documentMarkup.ToString();
}
/// <summary>
/// Copies the markup stored for the node's element child onto the elements node itself.
/// </summary>
/// <param name="ctx">Parse-tree node wrapping a single top-level element.</param>
public override void ExitHtmlElements(HTMLParser.HtmlElementsContext ctx)
{
    FilteredPut(ctx, MyTreeProperty.Get(ctx.htmlElement()));
}
/// <summary>
/// Records the text of a DTD token in <c>_results.DtdNodes</c>;
/// missing or blank text is ignored.
/// </summary>
/// <param name="context">Parse-tree node of the DTD.</param>
public override void ExitDtd(HTMLParser.DtdContext context)
{
    var text = context.DTD()?.GetText();
    if (!string.IsNullOrWhiteSpace(text))
    {
        _results.DtdNodes.Add(text);
    }
}
/// <summary>
/// Lexes and parses <paramref name="stream"/> with the HTML grammar, walks the resulting
/// tree with an <c>AspNetParseTree</c> listener and returns that listener's <c>Results</c>.
/// </summary>
/// <param name="stream">Stream handed to <c>AntlrInputStream</c>.</param>
/// <returns>The <c>Results</c> of the listener after the walk.</returns>
public static HtmlParseResults InvokeParse(Stream stream)
{
    var lexer = new HTMLLexer(new AntlrInputStream(stream));
    var parser = new HTMLParser(new CommonTokenStream(lexer));
    var documentTree = parser.htmlDocument();

    var walker = new ParseTreeWalker();
    var listener = new AspNetParseTree();
    walker.Walk(listener, documentTree);

    return listener.Results;
}