示例#1
0
            /// <summary>
            /// Resolves the font size requested by the tag's <c>size</c> attribute and inserts it at
            /// the front of <c>instance.fontSizeStack</c>.
            /// </summary>
            /// <remarks>
            /// A value matched by <c>CommonTagActions.PAT_FONT_SIZE</c> is absolute when its first
            /// group is empty; otherwise it is applied relative to the first non-null entry found on
            /// the stack (3 when there is none). A missing or unmatched attribute records
            /// <c>null</c>, so exactly one entry is added per call.
            /// </remarks>
            /// <param name="instance">Content handler whose font-size stack is updated.</param>
            /// <param name="localName">Tag name; not used by this method.</param>
            /// <param name="atts">Attributes of the tag; the <c>size</c> entry may be absent.</param>
            /// <returns>Always <c>false</c>.</returns>
            public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
            {
                // Guard the lookup: dereferencing a missing attribute would fault on every tag
                // without a size (assumes the indexer yields null for absent names -- confirm).
                var sizeAttribute = atts["size"];
                string sizeAttr   = sizeAttribute != null ? sizeAttribute.Value : null;

                int? resolvedSize = null;

                if (sizeAttr != null)
                {
                    Matcher m = CommonTagActions.PAT_FONT_SIZE.Matcher(sizeAttr);
                    if (m.Matches())
                    {
                        string rel = m.Group(1);
                        int    val = System.Convert.ToInt32(m.Group(2));

                        if (rel.Length == 0)
                        {
                            // absolute
                            resolvedSize = val;
                        }
                        else
                        {
                            // relative: start from the first known size on the stack, default 3
                            int baseSize = 3;
                            foreach (int? s in instance.fontSizeStack)
                            {
                                if (s != null)
                                {
                                    baseSize = s.Value;
                                    break;
                                }
                            }
                            resolvedSize = rel[0] == '+' ? baseSize + val : baseSize - val;
                        }
                    }
                }

                instance.fontSizeStack.Add(0, resolvedSize);
                return false;
            }
示例#2
0
 /// <summary>
 /// Decrements the anchor depth and, once it reaches zero outside any ignorable element,
 /// appends the anchor-end marker followed by a space to the token builder.
 /// </summary>
 /// <param name="instance">Content handler whose anchor state and token builder are updated.</param>
 /// <param name="localName">Tag name; not used by this method.</param>
 /// <returns>Always <c>false</c>.</returns>
 public bool End(NBoilerpipeContentHandler instance, string localName)
 {
     instance.inAnchor--;

     bool outermostAnchorClosed = instance.inAnchor == 0;
     if (outermostAnchorClosed && instance.inIgnorableElement == 0)
     {
         instance.AddWhitespaceIfNecessary();
         instance.tokenBuilder.Append(NBoilerpipeContentHandler.ANCHOR_TEXT_END);
         instance.tokenBuilder.Append(' ');
         instance.sbLastWasWhitespace = true;
     }

     return false;
 }
示例#3
0
        /// <summary>
        /// Builds the markup labels for a tag (tag name, class-derived and id-derived labels),
        /// combines them with the labels returned by <c>GetAncestorLabels()</c>, registers the
        /// result on <paramref name="instance"/> as a <c>LabelAction</c> and appends the tag's own
        /// labels to <c>labelStack</c>.
        /// </summary>
        /// <param name="instance">Content handler that receives the label action.</param>
        /// <param name="localName">Tag name, used for the first label.</param>
        /// <param name="atts">Attributes of the tag; <c>class</c> and <c>id</c> may be absent.</param>
        /// <returns>The value of <c>isBlockLevel</c>.</returns>
        /// <exception cref="Sharpen.SAXException"></exception>
        public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
        {
            IList<string> labels = new AList<string>(5);

            labels.AddItem(DefaultLabels.MARKUP_PREFIX + localName);

            // Guard the lookup so a tag without a class attribute does not fault
            // (assumes the indexer yields null for absent names -- confirm).
            var    classAtt = atts["class"];
            string classVal = classAtt != null ? classAtt.Value : null;

            if (classVal != null && classVal.Length > 0)
            {
                classVal = PAT_NUM.Matcher(classVal).ReplaceAll("#");
                classVal = classVal.Trim();
                string[] vals = classVal.Split("[ ]+");
                labels.AddItem(DefaultLabels.MARKUP_PREFIX + "." + classVal.Replace(' ', '.'));
                if (vals.Length > 1)
                {
                    foreach (string s in vals)
                    {
                        labels.AddItem(DefaultLabels.MARKUP_PREFIX + "." + s);
                    }
                }
            }

            // Test the attribute itself (not the collection) for null, and label with the
            // attribute's value; its name is only the lookup key.
            var    idAtt = atts["id"];
            string id    = idAtt != null ? idAtt.Value : "";

            if (id != null && id.Length > 0)
            {
                id = PAT_NUM.Matcher(id).ReplaceAll("#");
                labels.AddItem(DefaultLabels.MARKUP_PREFIX + "#" + id);
            }

            ICollection<string> ancestors           = GetAncestorLabels();
            IList<string>       labelsWithAncestors = new AList<string>((ancestors.Count + 1) * labels.Count);

            foreach (string l in labels)
            {
                foreach (string an in ancestors)
                {
                    labelsWithAncestors.AddItem(an);
                    labelsWithAncestors.AddItem(an + " " + l);
                }
                labelsWithAncestors.AddItem(l);
            }

            instance.AddLabelAction(new LabelAction(Sharpen.Collections.ToArray(
                labelsWithAncestors, new string[labelsWithAncestors.Count])));
            labelStack.AddItem(labels);
            return isBlockLevel;
        }
示例#4
0
            /// <summary>
            /// Increments the anchor depth and, unless inside an ignorable element, appends the
            /// anchor-start marker followed by a space to the token builder. If an anchor was
            /// already open, <c>End</c> is invoked first.
            /// </summary>
            /// <param name="instance">Content handler whose anchor state and token builder are updated.</param>
            /// <param name="localName">Tag name, forwarded to <c>End</c> for a nested anchor.</param>
            /// <param name="atts">Attributes of the tag; not used by this method.</param>
            /// <returns>Always <c>false</c>.</returns>
            /// <exception cref="Sharpen.SAXException"></exception>
            public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
            {
                bool anchorAlreadyOpen = instance.inAnchor > 0;
                instance.inAnchor++;

                if (anchorAlreadyOpen)
                {
                    // Nested A elements are not allowed per specification, so this is most
                    // likely caused by a parser bug; close the outer anchor first.
                    this.End(instance, localName);
                }

                if (instance.inIgnorableElement != 0)
                {
                    return false;
                }

                instance.AddWhitespaceIfNecessary();
                instance.tokenBuilder.Append(NBoilerpipeContentHandler.ANCHOR_TEXT_START);
                instance.tokenBuilder.Append(' ');
                instance.sbLastWasWhitespace = true;
                return false;
            }
示例#5
0
 /// <summary>
 /// Increments the anchor depth and, unless inside an ignorable element, appends the
 /// anchor-start marker followed by a space to the token builder. If an anchor was already
 /// open, a warning is written to standard error and <c>End</c> is invoked first.
 /// </summary>
 /// <param name="instance">Content handler whose anchor state and token builder are updated.</param>
 /// <param name="localName">Tag name, forwarded to <c>End</c> for a nested anchor.</param>
 /// <param name="atts">Attributes of the tag; not used by this method.</param>
 /// <returns>Always <c>false</c>.</returns>
 /// <exception cref="Sharpen.SAXException"></exception>
 public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
 {
     bool anchorAlreadyOpen = instance.inAnchor > 0;
     instance.inAnchor++;

     if (anchorAlreadyOpen)
     {
         // Nested A elements are not allowed per specification, so this is most likely
         // caused by a parser bug; warn and close the outer anchor first.
         System.Console.Error.WriteLine("Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow...");
         this.End(instance, localName);
     }

     if (instance.inIgnorableElement != 0)
     {
         return false;
     }

     instance.AddWhitespaceIfNecessary();
     instance.tokenBuilder.Append(NBoilerpipeContentHandler.ANCHOR_TEXT_START);
     instance.tokenBuilder.Append(' ');
     instance.sbLastWasWhitespace = true;
     return false;
 }
示例#6
0
 /// <summary>
 /// Builds the markup labels for a tag (tag name, class-derived and id-derived labels),
 /// combines them with the labels returned by <c>GetAncestorLabels()</c>, registers the
 /// result on <paramref name="instance"/> as a <c>LabelAction</c> and appends the tag's own
 /// labels to <c>labelStack</c>.
 /// </summary>
 /// <param name="instance">Content handler that receives the label action.</param>
 /// <param name="localName">Tag name, used for the first label.</param>
 /// <param name="atts">Attributes of the tag; <c>class</c> and <c>id</c> may be absent.</param>
 /// <returns>The value of <c>isBlockLevel</c>.</returns>
 /// <exception cref="Sharpen.SAXException"></exception>
 public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
 {
     IList<string> labels = new AList<string>(5);
     labels.AddItem(DefaultLabels.MARKUP_PREFIX + localName);

     // Guard the lookup so a tag without a class attribute does not fault
     // (assumes the indexer yields null for absent names -- confirm).
     var classAtt = atts["class"];
     string classVal = classAtt != null ? classAtt.Value : null;
     if (classVal != null && classVal.Length > 0)
     {
         classVal = PAT_NUM.Matcher(classVal).ReplaceAll("#");
         classVal = classVal.Trim();
         string[] vals = classVal.Split("[ ]+");
         labels.AddItem(DefaultLabels.MARKUP_PREFIX + "." + classVal.Replace(' ', '.'));
         if (vals.Length > 1)
         {
             foreach (string s in vals)
             {
                 labels.AddItem(DefaultLabels.MARKUP_PREFIX + "." + s);
             }
         }
     }

     // Test the attribute itself (not the collection) for null, and label with the
     // attribute's value; its name is only the lookup key.
     var idAtt = atts["id"];
     string id = idAtt != null ? idAtt.Value : "";
     if (id != null && id.Length > 0)
     {
         id = PAT_NUM.Matcher(id).ReplaceAll("#");
         labels.AddItem(DefaultLabels.MARKUP_PREFIX + "#" + id);
     }

     ICollection<string> ancestors = GetAncestorLabels();
     IList<string> labelsWithAncestors = new AList<string>((ancestors.Count + 1) * labels.Count);
     foreach (string l in labels)
     {
         foreach (string an in ancestors)
         {
             labelsWithAncestors.AddItem(an);
             labelsWithAncestors.AddItem(an + " " + l);
         }
         labelsWithAncestors.AddItem(l);
     }

     instance.AddLabelAction(new LabelAction(Sharpen.Collections.ToArray(
         labelsWithAncestors, new string[labelsWithAncestors.Count])));
     labelStack.AddItem(labels);
     return isBlockLevel;
 }
 /// <summary>
 /// Creates a parser that stores the given content handler in its <c>contentHandler</c> member.
 /// </summary>
 /// <param name="contentHandler">Handler to store; no validation is performed here.</param>
 public NBoilerpipeHtmlParser(NBoilerpipeContentHandler contentHandler)
 {
     this.contentHandler = contentHandler;
 }
示例#8
0
			/// <summary>
			/// Increments the handler's ignorable-element counter.
			/// </summary>
			/// <param name="instance">Content handler whose counter is incremented.</param>
			/// <param name="localName">Tag name; not used by this method.</param>
			/// <param name="atts">Attributes of the tag; not used by this method.</param>
			/// <returns>Always <c>true</c>.</returns>
			public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
			{
				instance.inIgnorableElement++;

				return true;
			}
示例#9
0
			/// <summary>
			/// Passes this object's <c>action</c> to <c>instance.AddLabelAction</c>.
			/// </summary>
			/// <param name="instance">Content handler that receives the label action.</param>
			/// <param name="localName">Tag name; not used by this method.</param>
			/// <param name="atts">Attributes of the tag; not used by this method.</param>
			/// <returns>Always <c>true</c>.</returns>
			public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
			{
				instance.AddLabelAction(action);
				return true;
			}
示例#10
0
			/// <summary>
			/// Calls <c>instance.AddWhitespaceIfNecessary()</c> and then passes this object's
			/// <c>action</c> to <c>instance.AddLabelAction</c>.
			/// </summary>
			/// <param name="instance">Content handler that is updated.</param>
			/// <param name="localName">Tag name; not used by this method.</param>
			/// <param name="atts">Attributes of the tag; not used by this method.</param>
			/// <returns>Always <c>false</c>.</returns>
			public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
			{
				instance.AddWhitespaceIfNecessary();
				instance.AddLabelAction(action);
				return false;
			}
示例#11
0
			/// <summary>
			/// Removes the entry at index 0 of <c>instance.fontSizeStack</c>.
			/// </summary>
			/// <param name="instance">Content handler whose font-size stack is modified.</param>
			/// <param name="localName">Tag name; not used by this method.</param>
			/// <returns>Always <c>false</c>.</returns>
			public bool End(NBoilerpipeContentHandler instance, string localName)
			{
				// NOTE(review): presumably undoes the front insertion made by the matching Start -- confirm.
				instance.fontSizeStack.RemoveAt(0);
				return false;
			}
示例#12
0
			/// <summary>
			/// Resolves the font size requested by the tag's <c>size</c> attribute and inserts it at
			/// index 0 of <c>instance.fontSizeStack</c>.
			/// </summary>
			/// <remarks>
			/// A value matched by <c>CommonTagActions.PAT_FONT_SIZE</c> is absolute when its first
			/// group is empty; otherwise it is applied relative to the first non-null entry found in
			/// the stack (3 when there is none). A missing or unmatched attribute records
			/// <c>null</c>, so exactly one entry is inserted per call.
			/// </remarks>
			/// <param name="instance">Content handler whose font-size stack is updated.</param>
			/// <param name="localName">Tag name; not used by this method.</param>
			/// <param name="atts">Attributes of the tag; the <c>size</c> entry may be absent.</param>
			/// <returns>Always <c>false</c>.</returns>
			public bool Start (NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
			{
				// Guard the lookup: dereferencing a missing attribute would fault on every tag
				// without a size (assumes the indexer yields null for absent names -- confirm).
				var sizeAttribute = atts["size"];
				string sizeAttr = sizeAttribute != null ? sizeAttribute.Value : null;

				int? resolvedSize = null;

				if (sizeAttr != null)
				{
					var m = CommonTagActions.PAT_FONT_SIZE.Matcher(sizeAttr);
					if (m.Matches())
					{
						string rel = m.Group(1);
						int val = System.Convert.ToInt32(m.Group(2));

						if (rel.Length == 0)
						{
							// absolute
							resolvedSize = val;
						}
						else
						{
							// relative: start from the first known size in the stack, default 3
							int baseSize = 3;
							foreach (int? s in instance.fontSizeStack)
							{
								if (s != null)
								{
									baseSize = s.Value;
									break;
								}
							}
							resolvedSize = rel[0] == '+' ? baseSize + val : baseSize - val;
						}
					}
				}

				instance.fontSizeStack.Insert(0, resolvedSize);
				return false;
			}
示例#13
0
			/// <summary>
			/// Calls <c>instance.FlushBlock()</c> and then decrements <c>instance.inBody</c>.
			/// </summary>
			/// <param name="instance">Content handler that is updated.</param>
			/// <param name="localName">Tag name; not used by this method.</param>
			/// <returns>Always <c>false</c>.</returns>
			public bool End(NBoilerpipeContentHandler instance, string localName)
			{
				instance.FlushBlock();
				instance.inBody--;
				return false;
			}
示例#14
0
 /// <summary>
 /// Passes this object's <c>action</c> to <c>instance.AddLabelAction</c>.
 /// </summary>
 /// <param name="instance">Content handler that receives the label action.</param>
 /// <param name="localName">Tag name; not used by this method.</param>
 /// <param name="atts">Attributes of the tag; not used by this method.</param>
 /// <returns>Always <c>true</c>.</returns>
 public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
 {
     instance.AddLabelAction(action);

     return true;
 }
示例#15
0
 /// <summary>
 /// Calls <c>instance.AddWhitespaceIfNecessary()</c>.
 /// </summary>
 /// <param name="instance">Content handler that is updated.</param>
 /// <param name="localName">Tag name; not used by this method.</param>
 /// <returns>Always <c>false</c>.</returns>
 public bool End(NBoilerpipeContentHandler instance, string localName)
 {
     instance.AddWhitespaceIfNecessary();

     return false;
 }
示例#16
0
 /// <summary>
 /// Calls <c>instance.AddWhitespaceIfNecessary()</c> and then passes this object's
 /// <c>action</c> to <c>instance.AddLabelAction</c>.
 /// </summary>
 /// <param name="instance">Content handler that is updated.</param>
 /// <param name="localName">Tag name; not used by this method.</param>
 /// <param name="atts">Attributes of the tag; not used by this method.</param>
 /// <returns>Always <c>false</c>.</returns>
 public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
 {
     instance.AddWhitespaceIfNecessary();
     instance.AddLabelAction(action);

     return false;
 }
示例#17
0
 /// <summary>
 /// Invokes <c>Start</c> on <c>t1</c> and then on <c>t2</c>, and combines their results.
 /// </summary>
 /// <param name="instance">Content handler forwarded to both actions.</param>
 /// <param name="localName">Tag name forwarded to both actions.</param>
 /// <param name="atts">Attributes forwarded to both actions.</param>
 /// <returns><c>true</c> if either action returned <c>true</c>.</returns>
 /// <exception cref="Sharpen.SAXException"></exception>
 public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
 {
     // Both actions always run, in this order; the results are OR-ed without short-circuiting.
     bool firstResult  = t1.Start(instance, localName, atts);
     bool secondResult = t2.Start(instance, localName, atts);

     return firstResult | secondResult;
 }
示例#18
0
 /// <summary>
 /// Calls <c>RemoveFirst()</c> on <c>instance.fontSizeStack</c>.
 /// </summary>
 /// <param name="instance">Content handler whose font-size stack is modified.</param>
 /// <param name="localName">Tag name; not used by this method.</param>
 /// <returns>Always <c>false</c>.</returns>
 public bool End(NBoilerpipeContentHandler instance, string localName)
 {
     instance.fontSizeStack.RemoveFirst();

     return false;
 }
示例#19
0
 /// <summary>
 /// Does nothing; no handler state is touched.
 /// </summary>
 /// <param name="instance">Content handler; not used by this method.</param>
 /// <param name="localName">Tag name; not used by this method.</param>
 /// <param name="atts">Attributes of the tag; not used by this method.</param>
 /// <returns>Always <c>false</c>.</returns>
 public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
 {
     return false;
 }
示例#20
0
 /// <summary>
 /// Calls <c>instance.FlushBlock()</c> and then decrements <c>instance.inBody</c>.
 /// </summary>
 /// <param name="instance">Content handler that is updated.</param>
 /// <param name="localName">Tag name; not used by this method.</param>
 /// <returns>Always <c>false</c>.</returns>
 public bool End(NBoilerpipeContentHandler instance, string localName)
 {
     instance.FlushBlock();
     instance.inBody--;

     return false;
 }
示例#21
0
			/// <summary>
			/// Calls <c>instance.FlushBlock()</c> and then increments <c>instance.inBody</c>.
			/// </summary>
			/// <param name="instance">Content handler that is updated.</param>
			/// <param name="localName">Tag name; not used by this method.</param>
			/// <param name="atts">Attributes of the tag; not used by this method.</param>
			/// <returns>Always <c>false</c>.</returns>
			public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
			{
				instance.FlushBlock();
				instance.inBody++;
				return false;
			}
示例#22
0
 /// <summary>
 /// Does nothing; no handler state is touched.
 /// </summary>
 /// <param name="instance">Content handler; not used by this method.</param>
 /// <param name="localName">Tag name; not used by this method.</param>
 /// <returns>Always <c>true</c>.</returns>
 public bool End(NBoilerpipeContentHandler instance, string localName)
 {
     return true;
 }
示例#23
0
			/// <summary>
			/// Does nothing; no handler state is touched.
			/// </summary>
			/// <param name="instance">Content handler; not used by this method.</param>
			/// <param name="localName">Tag name; not used by this method.</param>
			/// <param name="atts">Attributes of the tag; not used by this method.</param>
			/// <returns>Always <c>false</c>.</returns>
			public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
			{
				return false;
			}
示例#24
0
 /// <summary>
 /// Invokes <c>End</c> on <c>t1</c> and then on <c>t2</c>, and combines their results.
 /// </summary>
 /// <param name="instance">Content handler forwarded to both actions.</param>
 /// <param name="localName">Tag name forwarded to both actions.</param>
 /// <returns><c>true</c> if either action returned <c>true</c>.</returns>
 /// <exception cref="Sharpen.SAXException"></exception>
 public bool End(NBoilerpipeContentHandler instance, string localName)
 {
     // Both actions always run, in this order; the results are OR-ed without short-circuiting.
     bool firstResult  = t1.End(instance, localName);
     bool secondResult = t2.End(instance, localName);

     return firstResult | secondResult;
 }
示例#25
0
			/// <summary>
			/// Invokes <c>Start</c> on <c>t1</c> and then on <c>t2</c>, and combines their results.
			/// </summary>
			/// <param name="instance">Content handler forwarded to both actions.</param>
			/// <param name="localName">Tag name forwarded to both actions.</param>
			/// <param name="atts">Attributes forwarded to both actions.</param>
			/// <returns><c>true</c> if either action returned <c>true</c>.</returns>
			/// <exception cref="Sharpen.SAXException"></exception>
			public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
			{
				// Both actions always run, in this order; the results are OR-ed without short-circuiting.
				bool firstResult = t1.Start(instance, localName, atts);
				bool secondResult = t2.Start(instance, localName, atts);

				return firstResult | secondResult;
			}
示例#26
0
 /// <summary>
 /// Increments the handler's ignorable-element counter.
 /// </summary>
 /// <param name="instance">Content handler whose counter is incremented.</param>
 /// <param name="localName">Tag name; not used by this method.</param>
 /// <param name="atts">Attributes of the tag; not used by this method.</param>
 /// <returns>Always <c>true</c>.</returns>
 public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
 {
     instance.inIgnorableElement++;

     return true;
 }
示例#27
0
			/// <summary>
			/// Invokes <c>End</c> on <c>t1</c> and then on <c>t2</c>, and combines their results.
			/// </summary>
			/// <param name="instance">Content handler forwarded to both actions.</param>
			/// <param name="localName">Tag name forwarded to both actions.</param>
			/// <returns><c>true</c> if either action returned <c>true</c>.</returns>
			/// <exception cref="Sharpen.SAXException"></exception>
			public bool End(NBoilerpipeContentHandler instance, string localName)
			{
				// Both actions always run, in this order; the results are OR-ed without short-circuiting.
				bool firstResult = t1.End(instance, localName);
				bool secondResult = t2.End(instance, localName);

				return firstResult | secondResult;
			}
示例#28
0
 /// <summary>
 /// Decrements the handler's ignorable-element counter.
 /// </summary>
 /// <param name="instance">Content handler whose counter is decremented.</param>
 /// <param name="localName">Tag name; not used by this method.</param>
 /// <returns>Always <c>true</c>.</returns>
 public bool End(NBoilerpipeContentHandler instance, string localName)
 {
     instance.inIgnorableElement--;

     return true;
 }
示例#29
0
			/// <summary>
			/// Calls <c>instance.AddWhitespaceIfNecessary()</c>.
			/// </summary>
			/// <param name="instance">Content handler that is updated.</param>
			/// <param name="localName">Tag name; not used by this method.</param>
			/// <returns>Always <c>false</c>.</returns>
			public bool End(NBoilerpipeContentHandler instance, string localName)
			{
				instance.AddWhitespaceIfNecessary();
				return false;
			}
示例#30
0
 /// <summary>
 /// Calls <c>RemoveLast()</c> on <c>labelStack</c>.
 /// </summary>
 /// <param name="instance">Content handler; not used by this method.</param>
 /// <param name="localName">Tag name; not used by this method.</param>
 /// <returns>The value of <c>isBlockLevel</c>.</returns>
 /// <exception cref="Sharpen.SAXException"></exception>
 public bool End(NBoilerpipeContentHandler instance, string localName)
 {
     labelStack.RemoveLast();

     return isBlockLevel;
 }
示例#31
0
			/// <summary>
			/// Does nothing; no handler state is touched.
			/// </summary>
			/// <param name="instance">Content handler; not used by this method.</param>
			/// <param name="localName">Tag name; not used by this method.</param>
			/// <returns>Always <c>true</c>.</returns>
			public bool End(NBoilerpipeContentHandler instance, string localName)
			{
				return true;
			}
示例#32
0
 /// <summary>
 /// Calls <c>RemoveLast()</c> on <c>labelStack</c>.
 /// </summary>
 /// <param name="instance">Content handler; not used by this method.</param>
 /// <param name="localName">Tag name; not used by this method.</param>
 /// <returns>The value of <c>isBlockLevel</c>.</returns>
 /// <exception cref="Sharpen.SAXException"></exception>
 public bool End(NBoilerpipeContentHandler instance, string localName)
 {
     labelStack.RemoveLast();
     return isBlockLevel;
 }
示例#33
0
			/// <summary>
			/// Decrements the handler's ignorable-element counter.
			/// </summary>
			/// <param name="instance">Content handler whose counter is decremented.</param>
			/// <param name="localName">Tag name; not used by this method.</param>
			/// <returns>Always <c>true</c>.</returns>
			public bool End(NBoilerpipeContentHandler instance, string localName)
			{
				instance.inIgnorableElement--;

				return true;
			}
示例#34
0
			/// <summary>
			/// Decrements the anchor depth. While anchors remain open, the ignorable-element
			/// counter is decremented instead of emitting anything; once the depth reaches zero
			/// outside any ignorable element, the anchor-end marker followed by a space is
			/// appended to the token builder.
			/// </summary>
			/// <param name="instance">Content handler whose anchor state and token builder are updated.</param>
			/// <param name="localName">Tag name; not used by this method.</param>
			/// <returns>Always <c>false</c>.</returns>
			public bool End(NBoilerpipeContentHandler instance, string localName)
			{
				instance.inAnchor--;

				if (instance.inAnchor != 0)
				{
					// Still inside an outer anchor: only unwind the ignorable counter.
					instance.inIgnorableElement--;
					return false;
				}

				if (instance.inIgnorableElement == 0)
				{
					instance.AddWhitespaceIfNecessary();
					instance.tokenBuilder.Append(NBoilerpipeContentHandler.ANCHOR_TEXT_END);
					instance.tokenBuilder.Append(' ');
					instance.sbLastWasWhitespace = true;
				}

				return false;
			}
示例#35
0
            /// <summary>
            /// Reads the <c>alt</c>/<c>title</c>, <c>width</c>, <c>height</c> and <c>src</c>
            /// attributes of the tag and, when the picture qualifies, appends a content
            /// <c>Document.TextBlock</c> carrying its source to <c>instance.textBlocks</c>.
            /// The ignorable-element counter is incremented on every call, including after a failure.
            /// </summary>
            /// <remarks>
            /// A dimension that is not a plain integer (for example <c>100%</c>) is treated as
            /// unknown rather than aborting the whole tag. The Wikimedia check is evaluated on
            /// its own, so a match on the alt-text or size criteria cannot leave it unset.
            /// </remarks>
            /// <param name="instance">Content handler that receives the text block.</param>
            /// <param name="localName">Tag name; not used by this method.</param>
            /// <param name="atts">Attributes of the tag.</param>
            /// <returns>Always <c>true</c>.</returns>
            public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
            {
                try
                {
                    var alt = atts.Contains("alt") ? atts["alt"].Value : "";
                    if (alt.Length < 5)
                    {
                        alt = atts.Contains("title") ? atts["title"].Value : alt;
                    }

                    // TryParse leaves 0 behind on failure; Math.Max then clamps to at least 1.
                    int width = 0;
                    if (atts.Contains("width"))
                    {
                        int.TryParse(atts["width"].Value.TrimEnd('p', 'x', ';'), out width);
                    }
                    width = Math.Max(width, 1);

                    int height = 0;
                    if (atts.Contains("height"))
                    {
                        int.TryParse(atts["height"].Value.TrimEnd('p', 'x', ';'), out height);
                    }
                    height = Math.Max(height, 1);

                    var src = atts.Contains("src") ? atts["src"].Value : FindAlternateSrc(atts);
                    bool hasSource = !string.IsNullOrWhiteSpace(src);

                    // Computed up front: inside a short-circuit || it would stay false whenever
                    // an earlier criterion already matched.
                    bool isWikimedia = hasSource && src.StartsWith("//upload.wikimedia.org");

                    if (instance.inIgnorableElement <= 0 && hasSource &&
                        (alt.Length > 5 || width > 400 || height > 320 || isWikimedia))
                    {
                        var altWidthHeight = FindAlternateWidthHieght(src);
                        width = Math.Max(altWidthHeight.Item1, width);
                        height = Math.Max(altWidthHeight.Item2, height);

                        if (src.StartsWith("//"))
                        {
                            src = "http:" + src;
                        }

                        if (width > 400 || height > 320 || isWikimedia)
                        {
                            var tb = new Document.TextBlock("", new Sharpen.BitSet(), Math.Max((Math.Max(width, height) / 6), alt.Length), 0, 0, 0, 0, src);
                            tb.SetIsContent(true);
                            instance.textBlocks.Add(tb);
                        }
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: a failure while inspecting the tag is logged, not propagated.
                    Debug.WriteLine("during boilerpipe parsing: " + ex.ToString());
                }

                instance.inIgnorableElement++;
                return true;
            }
示例#36
0
 /// <summary>
 /// Calls <c>instance.FlushBlock()</c> and then increments <c>instance.inBody</c>.
 /// </summary>
 /// <param name="instance">Content handler that is updated.</param>
 /// <param name="localName">Tag name; not used by this method.</param>
 /// <param name="atts">Attributes of the tag; not used by this method.</param>
 /// <returns>Always <c>false</c>.</returns>
 public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
 {
     instance.FlushBlock();
     instance.inBody++;

     return false;
 }
示例#37
0
 /// <summary>
 /// Creates a parser that stores the given content handler in its <c>contentHandler</c> member.
 /// </summary>
 /// <param name="contentHandler">Handler to store; no validation is performed here.</param>
 public NBoilerpipeHtmlParser(NBoilerpipeContentHandler contentHandler)
 {
     this.contentHandler = contentHandler;
 }
示例#38
0
			/// <summary>
			/// Increments the anchor depth. If an anchor was already open, the ignorable-element
			/// counter is incremented as well. Unless inside an ignorable element, the
			/// anchor-start marker followed by a space is then appended to the token builder.
			/// </summary>
			/// <param name="instance">Content handler whose anchor state and token builder are updated.</param>
			/// <param name="localName">Tag name; not used by this method.</param>
			/// <param name="atts">Attributes of the tag; not used by this method.</param>
			/// <returns>Always <c>false</c>.</returns>
			/// <exception cref="Sharpen.SAXException"></exception>
			public bool Start(NBoilerpipeContentHandler instance, string localName, HtmlAttributeCollection atts)
			{
				bool anchorAlreadyOpen = instance.inAnchor > 0;
				instance.inAnchor++;

				if (anchorAlreadyOpen)
				{
					// Nested A elements are not allowed per specification, so this is most
					// likely caused by a parser bug. The nested anchor is counted as an
					// ignorable element rather than force-closing the outer one.
					instance.inIgnorableElement++;
				}

				if (instance.inIgnorableElement != 0)
				{
					return false;
				}

				instance.AddWhitespaceIfNecessary();
				instance.tokenBuilder.Append(NBoilerpipeContentHandler.ANCHOR_TEXT_START);
				instance.tokenBuilder.Append(' ');
				instance.sbLastWasWhitespace = true;
				return false;
			}