Beispiel #1
0
        /// <summary>
        /// Writes every link of <paramref name="parsedHtml"/> to a numbered log file
        /// (<c>D:\HtmlParser\LOG\bing&lt;num&gt;.txt</c>) and advances the file counter.
        /// </summary>
        /// <param name="parsedHtml">Parsed document whose <c>Links</c> are dumped.</param>
        private static void Dump(ParsedHtml parsedHtml)
        {
            string logPath = @"D:\HtmlParser\LOG\bing" + num.ToString() + ".txt";

            // The writer must be disposed: StreamWriter buffers its output and does not
            // flush on finalization, so without the using block the tail of the log
            // (or all of it) could be lost and the file handle would stay open.
            using (StreamWriter sw = new StreamWriter(logPath))
            {
                // Advance the counter only once the file was opened successfully,
                // so the next dump goes to a fresh file.
                num++;

                int link = 0;

                foreach (LinkItem item in parsedHtml.Links)
                {
                    link++;
                    sw.WriteLine("Line:{0}\n{1}\n", link, item.ToString());
                }
            }
        }
        /// <summary>
        /// Extracts links from all nodes of <paramref name="parsedHtml"/> into a fresh
        /// <c>Links</c> list, then labels image anchors.
        /// </summary>
        /// <param name="parsedHtml">Parsed document; its <c>Html</c> and <c>Nodes</c> are expected to be set.</param>
        /// <param name="extractSurroundingText">Stored on this instance before the nodes are processed.</param>
        public void Process(ParsedHtml parsedHtml, bool extractSurroundingText)
        {
            Debug.Assert(parsedHtml != null);
            Debug.Assert(parsedHtml.Html != null);
            Debug.Assert(parsedHtml.Nodes != null);

            this.extractSurroundingText = extractSurroundingText;
            this.parsedHtml = parsedHtml;
            parsedHtml.Links = new List<LinkItem>();

            // nodeIndexOfLink[k] is the node index at which the k-th link showed up.
            // The bookkeeping advances by one per node, i.e. it relies on ProcessNode
            // adding at most one link per node (checked by the assert below).
            List<int> nodeIndexOfLink = new List<int>();
            int recordedLinks = 0;

            int nodeIndex = 0;
            while (nodeIndex < parsedHtml.Nodes.Count)
            {
                ProcessNode(nodeIndex);

                if (parsedHtml.Links.Count > recordedLinks)
                {
                    nodeIndexOfLink.Add(nodeIndex);
                    recordedLinks++;
                }

                nodeIndex++;
            }

            //TODO: move the following out of HtmlParser since this is not generic
            // Extract Image Anchor like <a href=...><img src=... alt=...></a>
            Debug.Assert(parsedHtml.Links.Count == recordedLinks);
            int lastPairStart = recordedLinks - 1;
            for (int k = 0; k < lastPairStart; k++)
            {
                // Only links coming from directly adjacent nodes can form an image anchor.
                if (nodeIndexOfLink[k] + 1 != nodeIndexOfLink[k + 1])
                {
                    continue;
                }

                LinkItem anchor = parsedHtml.Links[k];
                if (anchor.LinkSource != LinkSourceType.Anchor ||
                    !string.IsNullOrEmpty(anchor.AnchorText))
                {
                    continue;
                }

                LinkItem image = parsedHtml.Links[k + 1];
                if (image.LinkSource != LinkSourceType.Image)
                {
                    continue;
                }

                // Text-less anchor wrapping an image: mark it and append the image's text, if any.
                anchor.AnchorText = "<Img_Anchor>";
                if (!string.IsNullOrEmpty(image.AnchorText))
                {
                    anchor.AnchorText += image.AnchorText;
                }
            }
        }
        /// <summary>
        /// Parses an HTML document: runs the markup parser over the text and, when it
        /// produces a node list, wraps text and nodes in a processed <c>ParsedHtml</c>.
        /// </summary>
        /// <param name="html">Content of the HTML file.</param>
        /// <returns>The parsed result, or <c>null</c> when the markup parser returns no node list.</returns>
        public ParsedHtml Parse(string html)
        {
            // Step 1: markup parsing.
            List<HtmlNode> parsedNodes = markupParser.Parse(html);
            if (parsedNodes == null)
            {
                return null;
            }

            // Step 2: bundle the source text with its nodes.
            ParsedHtml result = new ParsedHtml
            {
                Html = html,
                Nodes = parsedNodes
            };

            // Step 3: let the result run its own processing pass.
            result.Process();

            return result;
        }
Beispiel #4
0
        /// <summary>
        /// Scans <paramref name="nodes"/> for form start tags and parses each form whose
        /// end tag can be located.
        /// </summary>
        /// <param name="nodes">Node list to scan.</param>
        /// <returns>The successfully parsed forms, in document order; empty when none were found.</returns>
        public static List<HtmlForm> ParseForms(List<HtmlNode> nodes)
        {
            List<HtmlForm> result = new List<HtmlForm>();

            for (int index = 0; index < nodes.Count; index++)
            {
                if (!nodes[index].IsStartTag(HtmlTagId.Form))
                {
                    continue;
                }

                int closingIndex = ParsedHtml.FindEndTag(nodes, index + 1, HtmlTagId.Form);
                if (closingIndex <= 0)
                {
                    // No usable end tag: keep scanning from the next node.
                    continue;
                }

                HtmlForm parsed = HtmlForm.Parse(nodes, index, closingIndex);
                if (parsed != null)
                {
                    result.Add(parsed);
                }

                // Resume after the form's end tag (the loop increment steps past it).
                index = closingIndex;
            }

            return result;
        }
Beispiel #5
0
        /// <summary>
        /// Processes every <c>*.htm</c> file found directly in <paramref name="directory"/>:
        /// parses its markup, dumps the nodes, extracts links (with surrounding text)
        /// and dumps the links.
        /// </summary>
        /// <param name="directory">Path of the folder to scan.</param>
        public static void Process(string directory)
        {
            DirectoryInfo sourceFolder = new DirectoryInfo(directory);

            foreach (FileInfo htmlFile in sourceFolder.GetFiles("*.htm"))
            {
                string fullPath = htmlFile.FullName;

                using (StreamReader reader = File.OpenText(fullPath))
                {
                    // Read and parse the markup, then dump the raw node list.
                    string content = reader.ReadToEnd();
                    List<HtmlNode> parsedNodes = markupParser.Parse(content);
                    Dump(fullPath, content, parsedNodes);

                    // A fresh extractor per file.
                    LinkExtractor extractor = new LinkExtractor();

                    ParsedHtml document = new ParsedHtml
                    {
                        Html = content,
                        Nodes = parsedNodes
                    };

                    extractor.Process(document, true);
                    Dump(document);
                }
            }
        }
 /// <summary>
 /// Extracts the links of <paramref name="parsedHtml"/> by forwarding to the link extractor.
 /// </summary>
 /// <param name="parsedHtml">Parsed document handed to the extractor.</param>
 /// <param name="extractSurroundingText">Passed through unchanged to the extractor.</param>
 public void ExtractLinks(ParsedHtml parsedHtml, bool extractSurroundingText)
     => linkExtractor.Process(parsedHtml, extractSurroundingText);