/// <summary>
/// Reads the optional <c>size</c> attribute of the opening element and pushes the
/// resulting font size onto the front of <c>instance.fontSizeStack</c>. A <c>null</c>
/// entry is pushed when the attribute is missing or does not match
/// <c>CommonTagActions.PAT_FONT_SIZE</c>.
/// </summary>
/// <param name="instance">Content handler whose font-size stack is updated.</param>
/// <param name="localName">Element name; not used by this method.</param>
/// <param name="atts">Attributes of the element; the <c>size</c> attribute may be absent.</param>
/// <returns>Always <c>false</c>.</returns>
public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
{
    // The attribute lookup yields nothing when the element has no "size" attribute,
    // so it must be checked before its Value is read.
    var sizeAttribute = atts != null ? atts["size"] : null;
    string sizeAttr = sizeAttribute != null ? sizeAttribute.Value : null;

    int? size = null;
    if (sizeAttr != null)
    {
        Matcher m = CommonTagActions.PAT_FONT_SIZE.Matcher(sizeAttr);
        if (m.Matches())
        {
            string rel = m.Group(1);
            int val = System.Convert.ToInt32(m.Group(2));
            if (rel.Length == 0)
            {
                // absolute
                size = val;
            }
            else
            {
                // relative: offset from the nearest known size on the stack,
                // falling back to 3 when no entry carries a value.
                int prevSize = 3;
                foreach (int? s in instance.fontSizeStack)
                {
                    if (s != null)
                    {
                        prevSize = s.Value;
                        break;
                    }
                }
                size = rel[0] == '+' ? prevSize + val : prevSize - val;
            }
        }
    }

    // Exactly one entry is pushed per call, whether or not a size was resolved.
    instance.fontSizeStack.Add(0, size);
    return false;
}
/// <summary>
/// Handles a closing anchor element. Decrements <c>instance.inAnchor</c>; when it
/// reaches zero and no ignorable element is open, appends the anchor-end marker
/// followed by a space to the token builder.
/// </summary>
/// <param name="instance">Content handler being updated.</param>
/// <param name="localName">Element name; not used by this method.</param>
/// <returns>Always <c>false</c>.</returns>
public bool End(NBoilerpipeContentHandler instance, string localName)
{
    instance.inAnchor -= 1;

    bool outermostAnchorClosed = instance.inAnchor == 0;
    if (outermostAnchorClosed && instance.inIgnorableElement == 0)
    {
        instance.AddWhitespaceIfNecessary();
        instance.tokenBuilder.Append(NBoilerpipeContentHandler.ANCHOR_TEXT_END);
        instance.tokenBuilder.Append(' ');
        instance.sbLastWasWhitespace = true;
    }

    return false;
}
/// <summary>
/// Builds markup labels for the opening element (tag name, class names, id), combines
/// them with the labels returned by <c>GetAncestorLabels()</c>, registers the result as
/// a <c>LabelAction</c> on <paramref name="instance"/> and pushes the element's own
/// labels onto <c>labelStack</c>.
/// </summary>
/// <param name="instance">Content handler that receives the label action.</param>
/// <param name="localName">Element name, used for the first label.</param>
/// <param name="atts">Attributes of the element; <c>class</c> and <c>id</c> are optional.</param>
/// <returns>The value of <c>isBlockLevel</c>.</returns>
/// <exception cref="Sharpen.SAXException"></exception>
public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
{
    IList<string> labels = new AList<string>(5);
    labels.AddItem(DefaultLabels.MARKUP_PREFIX + localName);

    // Attribute lookups yield nothing for attributes the element does not carry,
    // so each one is checked before its Value is read.
    var classAtt = atts != null ? atts["class"] : null;
    string classVal = classAtt != null ? classAtt.Value : null;
    if (classVal != null && classVal.Length > 0)
    {
        classVal = PAT_NUM.Matcher(classVal).ReplaceAll("#");
        classVal = classVal.Trim();
        // NOTE(review): relies on a Split overload that treats "[ ]+" as a pattern
        // rather than a literal separator -- confirm which overload is bound here.
        string[] vals = classVal.Split("[ ]+");
        labels.AddItem(DefaultLabels.MARKUP_PREFIX + "." + classVal.Replace(' ', '.'));
        if (vals.Length > 1)
        {
            foreach (string s in vals)
            {
                labels.AddItem(DefaultLabels.MARKUP_PREFIX + "." + s);
            }
        }
    }

    // Use the id attribute's VALUE (not its name, which is always "id").
    var idAtt = atts != null ? atts["id"] : null;
    string id = idAtt != null ? idAtt.Value : null;
    if (id != null && id.Length > 0)
    {
        id = PAT_NUM.Matcher(id).ReplaceAll("#");
        labels.AddItem(DefaultLabels.MARKUP_PREFIX + "#" + id);
    }

    ICollection<string> ancestors = GetAncestorLabels();
    IList<string> labelsWithAncestors = new AList<string>((ancestors.Count + 1) * labels.Count);
    foreach (string label in labels)
    {
        foreach (string an in ancestors)
        {
            labelsWithAncestors.AddItem(an);
            labelsWithAncestors.AddItem(an + " " + label);
        }
        labelsWithAncestors.AddItem(label);
    }

    instance.AddLabelAction(new LabelAction(
        Sharpen.Collections.ToArray(labelsWithAncestors, new string[labelsWithAncestors.Count])));
    labelStack.AddItem(labels);
    return isBlockLevel;
}
/// <summary>
/// Handles an opening anchor element. Increments <c>instance.inAnchor</c>; if an anchor
/// was already open, first calls <c>End</c> for it. Unless an ignorable element is open,
/// appends the anchor-start marker followed by a space to the token builder.
/// </summary>
/// <param name="instance">Content handler being updated.</param>
/// <param name="localName">Element name, forwarded to <c>End</c> on recovery.</param>
/// <param name="atts">Attributes of the element; not used by this method.</param>
/// <returns>Always <c>false</c>.</returns>
/// <exception cref="Sharpen.SAXException"></exception>
public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
{
    bool alreadyInsideAnchor = instance.inAnchor > 0;
    instance.inAnchor += 1;

    if (alreadyInsideAnchor)
    {
        // Nested A elements are not allowed per specification, so reaching this
        // branch probably indicates a parser bug; close the outer anchor.
        this.End(instance, localName);
    }

    if (instance.inIgnorableElement != 0)
    {
        return false;
    }

    instance.AddWhitespaceIfNecessary();
    instance.tokenBuilder.Append(NBoilerpipeContentHandler.ANCHOR_TEXT_START);
    instance.tokenBuilder.Append(' ');
    instance.sbLastWasWhitespace = true;
    return false;
}
/// <summary>
/// Handles an opening anchor element. Increments <c>instance.inAnchor</c>; if an anchor
/// was already open, writes a warning to standard error and calls <c>End</c> for it.
/// Unless an ignorable element is open, appends the anchor-start marker followed by a
/// space to the token builder.
/// </summary>
/// <param name="instance">Content handler being updated.</param>
/// <param name="localName">Element name, forwarded to <c>End</c> on recovery.</param>
/// <param name="atts">Attributes of the element; not used by this method.</param>
/// <returns>Always <c>false</c>.</returns>
/// <exception cref="Sharpen.SAXException"></exception>
public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
{
    bool alreadyInsideAnchor = instance.inAnchor > 0;
    instance.inAnchor += 1;

    if (alreadyInsideAnchor)
    {
        // Nested A elements are not allowed per specification, so reaching this
        // branch probably indicates a parser bug; warn and close the outer anchor.
        System.Console.Error.WriteLine("Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow...");
        this.End(instance, localName);
    }

    if (instance.inIgnorableElement != 0)
    {
        return false;
    }

    instance.AddWhitespaceIfNecessary();
    instance.tokenBuilder.Append(NBoilerpipeContentHandler.ANCHOR_TEXT_START);
    instance.tokenBuilder.Append(' ');
    instance.sbLastWasWhitespace = true;
    return false;
}
/// <summary>
/// Collects markup labels for the opening element -- one for the tag name, one or more
/// for its <c>class</c> attribute and one for its <c>id</c> attribute -- crosses them
/// with the labels from <c>GetAncestorLabels()</c>, adds the combined set to
/// <paramref name="instance"/> as a <c>LabelAction</c>, and pushes the element's own
/// labels onto <c>labelStack</c>.
/// </summary>
/// <param name="instance">Content handler that receives the label action.</param>
/// <param name="localName">Element name, used for the first label.</param>
/// <param name="atts">Attributes of the element; <c>class</c> and <c>id</c> are optional.</param>
/// <returns>The value of <c>isBlockLevel</c>.</returns>
/// <exception cref="Sharpen.SAXException"></exception>
public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
{
    IList<string> labels = new AList<string>(5);
    labels.AddItem(DefaultLabels.MARKUP_PREFIX + localName);

    // A missing attribute produces no attribute object; guard before reading Value.
    var classAttribute = atts != null ? atts["class"] : null;
    string classVal = classAttribute != null ? classAttribute.Value : null;
    if (classVal != null && classVal.Length > 0)
    {
        classVal = PAT_NUM.Matcher(classVal).ReplaceAll("#");
        classVal = classVal.Trim();
        // NOTE(review): assumes the Split overload in scope interprets "[ ]+" as a
        // pattern, not as a literal separator -- confirm.
        string[] vals = classVal.Split("[ ]+");
        labels.AddItem(DefaultLabels.MARKUP_PREFIX + "." + classVal.Replace(' ', '.'));
        if (vals.Length > 1)
        {
            foreach (string s in vals)
            {
                labels.AddItem(DefaultLabels.MARKUP_PREFIX + "." + s);
            }
        }
    }

    // The label must be built from the id attribute's value; its name is always "id".
    var idAttribute = atts != null ? atts["id"] : null;
    string id = idAttribute != null ? idAttribute.Value : null;
    if (id != null && id.Length > 0)
    {
        id = PAT_NUM.Matcher(id).ReplaceAll("#");
        labels.AddItem(DefaultLabels.MARKUP_PREFIX + "#" + id);
    }

    ICollection<string> ancestors = GetAncestorLabels();
    IList<string> labelsWithAncestors = new AList<string>((ancestors.Count + 1) * labels.Count);
    foreach (string ownLabel in labels)
    {
        foreach (string an in ancestors)
        {
            labelsWithAncestors.AddItem(an);
            labelsWithAncestors.AddItem(an + " " + ownLabel);
        }
        labelsWithAncestors.AddItem(ownLabel);
    }

    instance.AddLabelAction(new LabelAction(
        Sharpen.Collections.ToArray(labelsWithAncestors, new string[labelsWithAncestors.Count])));
    labelStack.AddItem(labels);
    return isBlockLevel;
}
/// <summary>
/// Creates a parser that stores the supplied content handler in <c>contentHandler</c>.
/// </summary>
/// <param name="contentHandler">Handler kept by this parser.</param>
public NBoilerpipeHtmlParser(NBoilerpipeContentHandler contentHandler)
{
    this.contentHandler = contentHandler;
}
/// <summary>
/// Handles an opening element by incrementing <c>instance.inIgnorableElement</c>.
/// </summary>
/// <param name="instance">Content handler whose counter is incremented.</param>
/// <param name="localName">Element name; not used by this method.</param>
/// <param name="atts">Attributes of the element; not used by this method.</param>
/// <returns>Always <c>true</c>.</returns>
public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
{
    instance.inIgnorableElement += 1;
    return true;
}
/// <summary>
/// Handles an opening element by registering <c>action</c> on the content handler.
/// </summary>
/// <param name="instance">Content handler that receives the label action.</param>
/// <param name="localName">Element name; not used by this method.</param>
/// <param name="atts">Attributes of the element; not used by this method.</param>
/// <returns>Always <c>true</c>.</returns>
public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
{
    instance.AddLabelAction(action);

    return true;
}
/// <summary>
/// Handles an opening element: asks the handler to add whitespace if necessary, then
/// registers <c>action</c> on it.
/// </summary>
/// <param name="instance">Content handler being updated.</param>
/// <param name="localName">Element name; not used by this method.</param>
/// <param name="atts">Attributes of the element; not used by this method.</param>
/// <returns>Always <c>false</c>.</returns>
public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
{
    instance.AddWhitespaceIfNecessary();
    instance.AddLabelAction(action);

    return false;
}
/// <summary>
/// Handles a closing element by removing the entry at index 0 of
/// <c>instance.fontSizeStack</c>.
/// </summary>
/// <param name="instance">Content handler whose font-size stack is popped.</param>
/// <param name="localName">Element name; not used by this method.</param>
/// <returns>Always <c>false</c>.</returns>
public bool End(NBoilerpipeContentHandler instance, string localName)
{
    const int front = 0;
    instance.fontSizeStack.RemoveAt(front);
    return false;
}
/// <summary>
/// Reads the optional <c>size</c> attribute of the opening element and inserts the
/// resulting font size at index 0 of <c>instance.fontSizeStack</c>. A <c>null</c>
/// entry is inserted when the attribute is missing or does not match
/// <c>CommonTagActions.PAT_FONT_SIZE</c>.
/// </summary>
/// <param name="instance">Content handler whose font-size stack is updated.</param>
/// <param name="localName">Element name; not used by this method.</param>
/// <param name="atts">Attributes of the element; the <c>size</c> attribute may be absent.</param>
/// <returns>Always <c>false</c>.</returns>
public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
{
    // An element without a "size" attribute yields no attribute object, so the
    // lookup result is checked before its Value is read.
    var sizeAttribute = atts != null ? atts["size"] : null;
    string sizeAttr = sizeAttribute != null ? sizeAttribute.Value : null;

    int? size = null;
    if (sizeAttr != null)
    {
        var m = CommonTagActions.PAT_FONT_SIZE.Matcher(sizeAttr);
        if (m.Matches())
        {
            string rel = m.Group(1);
            int val = System.Convert.ToInt32(m.Group(2));
            if (rel.Length == 0)
            {
                // absolute
                size = val;
            }
            else
            {
                // relative: offset from the first non-null entry on the stack,
                // or from 3 when there is none.
                int prevSize = 3;
                foreach (int? s in instance.fontSizeStack)
                {
                    if (s != null)
                    {
                        prevSize = s.Value;
                        break;
                    }
                }
                size = rel[0] == '+' ? prevSize + val : prevSize - val;
            }
        }
    }

    // Exactly one entry is inserted per call, whether or not a size was resolved.
    instance.fontSizeStack.Insert(0, size);
    return false;
}
/// <summary>
/// Handles a closing element: flushes the handler's current block and decrements
/// <c>instance.inBody</c>.
/// </summary>
/// <param name="instance">Content handler being updated.</param>
/// <param name="localName">Element name; not used by this method.</param>
/// <returns>Always <c>false</c>.</returns>
public bool End(NBoilerpipeContentHandler instance, string localName)
{
    instance.FlushBlock();
    instance.inBody -= 1;
    return false;
}
/// <summary>
/// Handles an opening element by adding <c>action</c> to the content handler.
/// </summary>
/// <param name="instance">Content handler that receives the label action.</param>
/// <param name="localName">Element name; not used by this method.</param>
/// <param name="atts">Attributes of the element; not used by this method.</param>
/// <returns>Always <c>true</c>.</returns>
public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
{
    instance.AddLabelAction(action);
    return true;
}
/// <summary>
/// Handles a closing element by asking the handler to add whitespace if necessary.
/// </summary>
/// <param name="instance">Content handler being updated.</param>
/// <param name="localName">Element name; not used by this method.</param>
/// <returns>Always <c>false</c>.</returns>
public bool End(NBoilerpipeContentHandler instance, string localName)
{
    instance.AddWhitespaceIfNecessary();
    return false;
}
/// <summary>
/// Handles an opening element: adds whitespace to the handler if necessary and then
/// adds <c>action</c> to it.
/// </summary>
/// <param name="instance">Content handler being updated.</param>
/// <param name="localName">Element name; not used by this method.</param>
/// <param name="atts">Attributes of the element; not used by this method.</param>
/// <returns>Always <c>false</c>.</returns>
public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
{
    instance.AddWhitespaceIfNecessary();
    instance.AddLabelAction(action);
    return false;
}
/// <summary>
/// Forwards the opening element to <c>t1</c> and then to <c>t2</c>. Both are always
/// invoked, in that order.
/// </summary>
/// <param name="instance">Content handler passed to both actions.</param>
/// <param name="localName">Element name passed to both actions.</param>
/// <param name="atts">Attributes passed to both actions.</param>
/// <returns><c>true</c> if either action returned <c>true</c>.</returns>
/// <exception cref="Sharpen.SAXException"></exception>
public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
{
    // Evaluate both eagerly so the second action runs even when the first returns true.
    bool firstResult = t1.Start(instance, localName, atts);
    bool secondResult = t2.Start(instance, localName, atts);
    return firstResult || secondResult;
}
/// <summary>
/// Handles a closing element by removing the first entry of
/// <c>instance.fontSizeStack</c>.
/// </summary>
/// <param name="instance">Content handler whose font-size stack is popped.</param>
/// <param name="localName">Element name; not used by this method.</param>
/// <returns>Always <c>false</c>.</returns>
public bool End(NBoilerpipeContentHandler instance, string localName)
{
    instance.fontSizeStack.RemoveFirst();
    return false;
}
/// <summary>
/// Handles an opening element without touching the content handler.
/// </summary>
/// <param name="instance">Content handler; not used by this method.</param>
/// <param name="localName">Element name; not used by this method.</param>
/// <param name="atts">Attributes of the element; not used by this method.</param>
/// <returns>Always <c>false</c>.</returns>
public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
{
    return false;
}
/// <summary>
/// Handles a closing element: flushes the current block on the handler, then
/// decrements <c>instance.inBody</c>.
/// </summary>
/// <param name="instance">Content handler being updated.</param>
/// <param name="localName">Element name; not used by this method.</param>
/// <returns>Always <c>false</c>.</returns>
public bool End(NBoilerpipeContentHandler instance, string localName)
{
    instance.FlushBlock();
    instance.inBody -= 1;

    return false;
}
/// <summary>
/// Handles an opening element: flushes the handler's current block and increments
/// <c>instance.inBody</c>.
/// </summary>
/// <param name="instance">Content handler being updated.</param>
/// <param name="localName">Element name; not used by this method.</param>
/// <param name="atts">Attributes of the element; not used by this method.</param>
/// <returns>Always <c>false</c>.</returns>
public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
{
    instance.FlushBlock();
    instance.inBody += 1;
    return false;
}
/// <summary>
/// Handles a closing element without touching the content handler.
/// </summary>
/// <param name="instance">Content handler; not used by this method.</param>
/// <param name="localName">Element name; not used by this method.</param>
/// <returns>Always <c>true</c>.</returns>
public bool End(NBoilerpipeContentHandler instance, string localName)
{
    return true;
}
/// <summary>
/// Handles an opening element; performs no work on the content handler.
/// </summary>
/// <param name="instance">Content handler; not used by this method.</param>
/// <param name="localName">Element name; not used by this method.</param>
/// <param name="atts">Attributes of the element; not used by this method.</param>
/// <returns>Always <c>false</c>.</returns>
public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
{
    return false;
}
/// <summary>
/// Forwards the closing element to <c>t1</c> and then to <c>t2</c>. Both are always
/// invoked, in that order.
/// </summary>
/// <param name="instance">Content handler passed to both actions.</param>
/// <param name="localName">Element name passed to both actions.</param>
/// <returns><c>true</c> if either action returned <c>true</c>.</returns>
/// <exception cref="Sharpen.SAXException"></exception>
public bool End(NBoilerpipeContentHandler instance, string localName)
{
    // Evaluate both eagerly so the second action runs even when the first returns true.
    bool firstResult = t1.End(instance, localName);
    bool secondResult = t2.End(instance, localName);
    return firstResult || secondResult;
}
/// <summary>
/// Runs <c>t1.Start</c> followed by <c>t2.Start</c> for the opening element; neither
/// call is skipped.
/// </summary>
/// <param name="instance">Content handler passed to both actions.</param>
/// <param name="localName">Element name passed to both actions.</param>
/// <param name="atts">Attributes passed to both actions.</param>
/// <returns><c>true</c> if at least one of the two calls returned <c>true</c>.</returns>
/// <exception cref="Sharpen.SAXException"></exception>
public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
{
    bool fromFirst = t1.Start(instance, localName, atts);
    bool fromSecond = t2.Start(instance, localName, atts);

    // Both results are already computed, so a short-circuit OR is safe here.
    return fromFirst || fromSecond;
}
/// <summary>
/// Handles an opening element by increasing <c>instance.inIgnorableElement</c> by one.
/// </summary>
/// <param name="instance">Content handler whose counter is incremented.</param>
/// <param name="localName">Element name; not used by this method.</param>
/// <param name="atts">Attributes of the element; not used by this method.</param>
/// <returns>Always <c>true</c>.</returns>
public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
{
    instance.inIgnorableElement += 1;
    return true;
}
/// <summary>
/// Runs <c>t1.End</c> followed by <c>t2.End</c> for the closing element; neither call
/// is skipped.
/// </summary>
/// <param name="instance">Content handler passed to both actions.</param>
/// <param name="localName">Element name passed to both actions.</param>
/// <returns><c>true</c> if at least one of the two calls returned <c>true</c>.</returns>
/// <exception cref="Sharpen.SAXException"></exception>
public bool End(NBoilerpipeContentHandler instance, string localName)
{
    bool fromFirst = t1.End(instance, localName);
    bool fromSecond = t2.End(instance, localName);

    // Both results are already computed, so a short-circuit OR is safe here.
    return fromFirst || fromSecond;
}
/// <summary>
/// Handles a closing element by decreasing <c>instance.inIgnorableElement</c> by one.
/// </summary>
/// <param name="instance">Content handler whose counter is decremented.</param>
/// <param name="localName">Element name; not used by this method.</param>
/// <returns>Always <c>true</c>.</returns>
public bool End(NBoilerpipeContentHandler instance, string localName)
{
    instance.inIgnorableElement -= 1;
    return true;
}
/// <summary>
/// Handles a closing element by calling <c>AddWhitespaceIfNecessary</c> on the handler.
/// </summary>
/// <param name="instance">Content handler being updated.</param>
/// <param name="localName">Element name; not used by this method.</param>
/// <returns>Always <c>false</c>.</returns>
public bool End(NBoilerpipeContentHandler instance, string localName)
{
    instance.AddWhitespaceIfNecessary();

    return false;
}
/// <summary>
/// Handles a closing element by removing the last entry of <c>labelStack</c>.
/// </summary>
/// <param name="instance">Content handler; not used by this method.</param>
/// <param name="localName">Element name; not used by this method.</param>
/// <returns>The value of <c>isBlockLevel</c>.</returns>
/// <exception cref="Sharpen.SAXException"></exception>
public bool End(NBoilerpipeContentHandler instance, string localName)
{
    labelStack.RemoveLast();
    return isBlockLevel;
}
/// <summary>
/// Handles a closing element; performs no work on the content handler.
/// </summary>
/// <param name="instance">Content handler; not used by this method.</param>
/// <param name="localName">Element name; not used by this method.</param>
/// <returns>Always <c>true</c>.</returns>
public bool End(NBoilerpipeContentHandler instance, string localName)
{
    return true;
}
/// <summary>
/// Handles a closing element by dropping the most recently added entry from
/// <c>labelStack</c>.
/// </summary>
/// <param name="instance">Content handler; not used by this method.</param>
/// <param name="localName">Element name; not used by this method.</param>
/// <returns>The value of <c>isBlockLevel</c>.</returns>
/// <exception cref="Sharpen.SAXException"></exception>
public bool End(NBoilerpipeContentHandler instance, string localName)
{
    labelStack.RemoveLast();

    return isBlockLevel;
}
/// <summary>
/// Handles a closing element by decrementing <c>instance.inIgnorableElement</c>.
/// </summary>
/// <param name="instance">Content handler whose counter is decremented.</param>
/// <param name="localName">Element name; not used by this method.</param>
/// <returns>Always <c>true</c>.</returns>
public bool End(NBoilerpipeContentHandler instance, string localName)
{
    instance.inIgnorableElement -= 1;
    return true;
}
/// <summary>
/// Handles a closing anchor element. Decrements <c>instance.inAnchor</c>. If the
/// counter is still non-zero afterwards, decrements <c>instance.inIgnorableElement</c>
/// instead of emitting anything. Otherwise, when no ignorable element is open, appends
/// the anchor-end marker followed by a space to the token builder.
/// </summary>
/// <param name="instance">Content handler being updated.</param>
/// <param name="localName">Element name; not used by this method.</param>
/// <returns>Always <c>false</c>.</returns>
public bool End(NBoilerpipeContentHandler instance, string localName)
{
    instance.inAnchor -= 1;

    if (instance.inAnchor != 0)
    {
        instance.inIgnorableElement -= 1;
        return false;
    }

    if (instance.inIgnorableElement == 0)
    {
        instance.AddWhitespaceIfNecessary();
        instance.tokenBuilder.Append(NBoilerpipeContentHandler.ANCHOR_TEXT_END);
        instance.tokenBuilder.Append(' ');
        instance.sbLastWasWhitespace = true;
    }

    return false;
}
/// <summary>
/// Handles an opening image element. When the image has a source and either a
/// descriptive text (<c>alt</c>, or <c>title</c> when <c>alt</c> is shorter than five
/// characters), a large declared size, or a Wikimedia upload URL, its dimensions are
/// refined via <c>FindAlternateWidthHieght</c>; if it is then wider than 400, taller
/// than 320, or flagged as Wikimedia, a content text block carrying the source URL is
/// added to <c>instance.textBlocks</c>. In every case <c>instance.inIgnorableElement</c>
/// is incremented.
/// </summary>
/// <param name="instance">Content handler that receives the text block.</param>
/// <param name="localName">Element name; not used by this method.</param>
/// <param name="atts">Attributes of the element.</param>
/// <returns>Always <c>true</c>.</returns>
public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
{
    try
    {
        string alt = atts.Contains("alt") ? (atts["alt"].Value ?? "") : "";
        if (alt.Length < 5 && atts.Contains("title"))
        {
            alt = atts["title"].Value ?? alt;
        }

        // Unparseable sizes such as "100%" or "auto" count as unknown (0) rather than
        // throwing and discarding the whole image.
        int width = Math.Max(ParsePixelDimension(atts, "width"), 1);
        int height = Math.Max(ParsePixelDimension(atts, "height"), 1);
        var src = atts.Contains("src") ? atts["src"].Value : FindAlternateSrc(atts);

        if (instance.inIgnorableElement <= 0 && !string.IsNullOrWhiteSpace(src))
        {
            bool describedOrLarge = alt.Length > 5 || width > 400 || height > 320;

            // NOTE(review): as in the original, the Wikimedia check only runs when the
            // image is neither described nor large -- confirm this is intended.
            bool isWikimedia = !describedOrLarge
                && src.StartsWith("//upload.wikimedia.org", System.StringComparison.Ordinal);

            if (describedOrLarge || isWikimedia)
            {
                var altWidthHeight = FindAlternateWidthHieght(src);
                width = Math.Max(altWidthHeight.Item1, width);
                height = Math.Max(altWidthHeight.Item2, height);

                // Protocol-relative URLs are given an explicit scheme.
                if (src.StartsWith("//", System.StringComparison.Ordinal))
                {
                    src = "http:" + src;
                }

                if (width > 400 || height > 320 || isWikimedia)
                {
                    var tb = new Document.TextBlock("", new Sharpen.BitSet(), Math.Max((Math.Max(width, height) / 6), alt.Length), 0, 0, 0, 0, src);
                    tb.SetIsContent(true);
                    instance.textBlocks.Add(tb);
                }
            }
        }
    }
    catch (Exception ex)
    {
        // Best effort: a failure while inspecting the image must not abort parsing.
        Debug.WriteLine("during boilerpipe parsing: " + ex.ToString());
    }

    // The counter is incremented whether or not a text block was produced.
    instance.inIgnorableElement++;
    return true;
}

/// <summary>
/// Reads an integer size attribute, ignoring trailing 'p', 'x' and ';' characters.
/// Returns 0 when the attribute is absent or its value is not an integer.
/// </summary>
private static int ParsePixelDimension(HtmlAttributeCollection atts, string name)
{
    if (!atts.Contains(name))
    {
        return 0;
    }

    string raw = atts[name].Value;
    if (raw == null)
    {
        return 0;
    }

    int parsed;
    bool ok = int.TryParse(
        raw.TrimEnd('p', 'x', ';'),
        System.Globalization.NumberStyles.Integer,
        System.Globalization.CultureInfo.InvariantCulture,
        out parsed);
    return ok ? parsed : 0;
}
/// <summary>
/// Handles an opening element: flushes the current block on the handler, then
/// increments <c>instance.inBody</c>.
/// </summary>
/// <param name="instance">Content handler being updated.</param>
/// <param name="localName">Element name; not used by this method.</param>
/// <param name="atts">Attributes of the element; not used by this method.</param>
/// <returns>Always <c>false</c>.</returns>
public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
{
    instance.FlushBlock();
    instance.inBody += 1;

    return false;
}
/// <summary>
/// Handles an opening anchor element. Increments <c>instance.inAnchor</c>; if an anchor
/// was already open, also increments <c>instance.inIgnorableElement</c>. When no
/// ignorable element is open afterwards, appends the anchor-start marker followed by a
/// space to the token builder.
/// </summary>
/// <param name="instance">Content handler being updated.</param>
/// <param name="localName">Element name; not used by this method.</param>
/// <param name="atts">Attributes of the element; not used by this method.</param>
/// <returns>Always <c>false</c>.</returns>
/// <exception cref="Sharpen.SAXException"></exception>
public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
{
    bool alreadyInsideAnchor = instance.inAnchor > 0;
    instance.inAnchor += 1;

    if (alreadyInsideAnchor)
    {
        // Nested A elements are not allowed per specification, so reaching this
        // branch probably indicates a parser bug. The nested anchor is counted as
        // an ignorable element instead of warning and closing the outer one.
        instance.inIgnorableElement += 1;
    }

    if (instance.inIgnorableElement != 0)
    {
        return false;
    }

    instance.AddWhitespaceIfNecessary();
    instance.tokenBuilder.Append(NBoilerpipeContentHandler.ANCHOR_TEXT_START);
    instance.tokenBuilder.Append(' ');
    instance.sbLastWasWhitespace = true;
    return false;
}