This is a simple API for the parsing process. Part of it is a port of the nu.validator.htmlparser.io.Driver class. The parser currently ignores the encoding declared in the HTML source and parses everything as UTF-8.
Beispiel #1
0
        /// <summary>
        /// Extracts the <c>href</c> values of all anchor elements found in the file at
        /// <c>response.TempFile</c> and writes each distinct value to <paramref name="fs"/>
        /// as a "LINK: ..." line.
        /// </summary>
        /// <remarks>
        /// Nothing is written when the temp file does not exist, or when its first
        /// non-blank line contains neither "&lt;html&gt;" nor "&lt;!doctype html".
        /// This is a best-effort step: any exception raised while reading or parsing
        /// is caught here and does not propagate to the caller.
        /// </remarks>
        /// <param name="fs">Output stream that receives the "LINK: ..." lines.</param>
        /// <param name="response">Response whose <c>TempFile</c> path identifies the file to scan.</param>
        private void ParseLinks(FileStream fs, Response response)
        {
            // Hex MD5 that the original code skips; its accompanying comment says it stands for "/".
            const string rootPathMd5 = "6666cd76f96956469e7be39d750cc7d9";

            try
            {
                if (File.Exists(response.TempFile) == false)
                {
                    return;
                }

                // Only hand the file to the HTML parser when it starts like an HTML document.
                if (FirstLineLooksLikeHtml(response.TempFile) == false)
                {
                    return;
                }

                using (TextReader tr = File.OpenText(response.TempFile))
                {
                    SimpleHtmlParser parser = new SimpleHtmlParser();
                    var doc = parser.Parse(tr);

                    // Hashes of the hrefs already written; a set keeps the duplicate check O(1) per link.
                    HashSet<string> seen = new HashSet<string>();
                    foreach (System.Xml.XmlElement link in doc.GetElementsByTagName("a"))
                    {
                        if (link.Attributes == null)
                        {
                            continue;
                        }

                        var hrefAttribute = link.Attributes["href"];
                        if (hrefAttribute == null)
                        {
                            continue;
                        }

                        var href = hrefAttribute.Value.Trim();

                        string md5 = Text.ConvertByteArrayToHexString(Security.GenerateMd5Hash(href));
                        if (string.Equals(md5, rootPathMd5, StringComparison.OrdinalIgnoreCase))
                        {
                            // Ignore "/"
                            continue;
                        }

                        // Add returns false when the hash is already present, i.e. the link was written before.
                        if (seen.Add(md5))
                        {
                            woanware.IO.WriteToFileStream(fs, "LINK: " + href + Environment.NewLine);
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                // Deliberately best-effort: a failure to extract links must not abort the caller.
                // Leave a trace in debug builds instead of swallowing the error silently.
                System.Diagnostics.Debug.WriteLine("ParseLinks failed: " + ex);
            }
        }

        /// <summary>
        /// Reports whether the first non-blank line of the file contains an HTML marker
        /// ("&lt;html&gt;" or "&lt;!doctype html", compared case-insensitively).
        /// </summary>
        /// <param name="path">Path of the file to inspect.</param>
        /// <returns>
        /// <c>true</c> when the first non-blank line contains one of the markers;
        /// <c>false</c> when it does not, or when the file has no non-blank line.
        /// </returns>
        private static bool FirstLineLooksLikeHtml(string path)
        {
            using (FileStream temp = System.IO.File.OpenRead(path))
            {
                using (LineReader lr = new LineReader(temp, 4096, Encoding.Default))
                {
                    string line;
                    while ((line = lr.ReadLine()) != null)
                    {
                        line = line.Trim();
                        if (line.Length == 0)
                        {
                            // Skip leading blank lines; only the first line with content decides.
                            continue;
                        }

                        // NOTE(review): an <html> tag carrying attributes (e.g. lang="en") does not
                        // match the literal "<html>" - confirm whether such documents should be skipped.
                        return line.IndexOf("<html>", StringComparison.OrdinalIgnoreCase) != -1
                            || line.IndexOf("<!doctype html", StringComparison.OrdinalIgnoreCase) != -1;
                    }

                    return false;
                }
            }
        }