Exemplo n.º 1
0
        /// <summary>
        /// Constructor. It only declares two local helper functions (<c>FromHtml</c> and
        /// <c>LoadWebText</c>); nothing in this body invokes them.
        /// </summary>
        public Webtext()
        {
            // Parses the HTML supplied by <reader> into an XmlDocument through SgmlReader.
            XmlDocument FromHtml(TextReader reader)
            {
                // setup SgmlReader
                Sgml.SgmlReader sgmlReader = new Sgml.SgmlReader();
                sgmlReader.DocType            = "HTML";
                sgmlReader.WhitespaceHandling = WhitespaceHandling.None;
                sgmlReader.CaseFolding        = Sgml.CaseFolding.ToLower;
                sgmlReader.InputStream        = reader;

                // create document; the resolver is cleared so no external resources are resolved
                XmlDocument doc = new XmlDocument();
                doc.PreserveWhitespace = true;
                doc.XmlResolver        = null;
                doc.Load(sgmlReader);
                return(doc);
            }

            // Downloads the resource at <URL>, decodes the bytes as UTF-8, parses the
            // result as HTML and returns the concatenated text content of the document.
            string LoadWebText(string URL)
            {
                UTF8Encoding objUTF8 = new UTF8Encoding();

                // WebClient and StringReader are IDisposable; release them deterministically.
                using (WebClient objWebClient = new WebClient())
                using (StringReader html = new StringReader(objUTF8.GetString(objWebClient.DownloadData(URL))))
                {
                    XmlDocument xml = FromHtml(html);

                    // InnerText is a property, not a method.
                    return(xml.InnerText);
                }
            }
        }
Exemplo n.º 2
0
 /// <summary>
 /// Parses HTML from <paramref name="reader"/> into an <c>XDoc</c>.
 /// Returns <c>XDoc.Empty</c> when the parsed document has no root element or when
 /// any exception is raised while loading.
 /// </summary>
 /// <param name="reader">Source of the HTML text.</param>
 /// <returns>The parsed document, or <c>XDoc.Empty</c> on failure.</returns>
 private static XDoc FromHtml(TextReader reader)
 {
     Sgml.SgmlReader htmlReader = new Sgml.SgmlReader(XDoc.XmlNameTable);
     htmlReader.Dtd                = _dtd;
     htmlReader.DocType            = "HTML";
     htmlReader.WhitespaceHandling = WhitespaceHandling.All;
     htmlReader.CaseFolding        = Sgml.CaseFolding.ToLower;
     htmlReader.InputStream        = reader;

     try {
         XmlDocument parsed = XDoc.NewXmlDocument();
         parsed.Load(htmlReader);
         if (parsed.DocumentElement != null)
         {
             // remember the DTD the reader ended up with so later calls can reuse it
             if (_dtd == null)
             {
                 _dtd = htmlReader.Dtd;
             }
             return(new XDoc(parsed));
         }
     } catch (Exception) {
         // parsing failures are reported as an empty document below
     }
     return(XDoc.Empty);
 }
Exemplo n.º 3
0
        /// <summary>
        /// Downloads the WSJ RSS feed, runs it through SgmlReader, prints the resulting
        /// markup to the console and passes every element of the parsed document to
        /// <c>ParseItem</c>.
        /// </summary>
        public void WallStreeJournalRssTest()
        {
            string downloaded;

            // WebClient is IDisposable; release it as soon as the download completes.
            using (WebClient client = new WebClient())
            {
                downloaded = client.DownloadString("http://online.wsj.com/xml/rss/3_7011.xml");
            }

            using (StringReader reader = new StringReader(downloaded))
            {
                // setup the SgmlReader
                Sgml.SgmlReader sgmlReader = new Sgml.SgmlReader();
                sgmlReader.DocType            = "HTML";
                sgmlReader.WhitespaceHandling = WhitespaceHandling.All;
                sgmlReader.CaseFolding        = Sgml.CaseFolding.ToLower;
                sgmlReader.InputStream        = reader;

                string rawrssfeed = sgmlReader.ReadOuterXml();
                Console.WriteLine(rawrssfeed);

                // rawrssfeed holds markup, not a location: XDocument.Load(string) would
                // interpret it as a URI/file name, so the text must go through Parse.
                XDocument rss = XDocument.Parse(rawrssfeed);

                // Hand every element of the feed to ParseItem.
                foreach (XElement p in rss.Descendants())
                {
                    ParseItem(p);
                }
            }
        }
Exemplo n.º 4
0
        /// <summary>
        /// Returns the <c>src</c> value of the first <c>img</c> element in
        /// <paramref name="html"/> that carries a <c>src</c> attribute, or <c>null</c>
        /// when there is none.
        /// </summary>
        static string GetImageFromHtml(string html)
        {
            var parsed = new XmlDocument();

            // parse the markup through SgmlReader
            using (var htmlReader = new Sgml.SgmlReader())
            {
                htmlReader.CaseFolding        = Sgml.CaseFolding.ToLower;
                htmlReader.DocType            = "HTML";
                htmlReader.WhitespaceHandling = WhitespaceHandling.None;

                using (var source = new StringReader(html))
                {
                    htmlReader.InputStream = source;
                    parsed.Load(htmlReader);
                }
            }

            foreach (XmlNode candidate in parsed.GetElementsByTagName("img"))
            {
                XmlAttribute src = candidate.Attributes["src"];
                if (src != null)
                {
                    return(src.Value);
                }
            }
            return(null);
        }
Exemplo n.º 5
0
 /// <summary>
 /// Creates the parser and its SgmlReader, configured for HTML input with all
 /// whitespace reported and names folded to lower case.
 /// </summary>
 public Parser()
 {
     _sgmlReader = new Sgml.SgmlReader
     {
         DocType            = "HTML",
         WhitespaceHandling = System.Xml.WhitespaceHandling.All,
         CaseFolding        = Sgml.CaseFolding.ToLower
     };
 }
Exemplo n.º 6
0
        /// <summary>
        /// Parses <paramref name="html"/>, removes every <c>script</c> element, passes the
        /// root element to <c>RemoveAttributeScript</c> and returns the resulting markup.
        /// </summary>
        /// <param name="html">HTML text to clean.</param>
        /// <returns>The serialized document after the removals.</returns>
        public static string RemoveScriptFromHtml(string html)
        {
            // load the document using sgml reader
            var document = new XmlDocument();

            using (var sgmlReader = new Sgml.SgmlReader())
            {
                sgmlReader.CaseFolding        = Sgml.CaseFolding.ToLower;
                sgmlReader.DocType            = "HTML";
                sgmlReader.WhitespaceHandling = WhitespaceHandling.None;

                using (var sr = new StringReader(html))
                {
                    sgmlReader.InputStream = sr;
                    document.Load(sgmlReader);
                }
            }

            // remove <script>
            // GetElementsByTagName returns a live list: removing an entry shifts the
            // following ones down. Walking forward with i++ therefore skips every other
            // element, so iterate from the end instead.
            var nodes = document.GetElementsByTagName("script");

            for (int i = nodes.Count - 1; i >= 0; i--)
            {
                XmlNode script = nodes[i];
                script.ParentNode.RemoveChild(script);
            }

            RemoveAttributeScript(document.DocumentElement);

            return(document.OuterXml);
        }
        /// <summary>
        /// Event handler: parses the downloaded content as HTML, extracts its text and
        /// reports on the console how many times each entry of <c>Keywords</c> occurs in it
        /// (case-insensitive; spaces in a keyword match any amount of whitespace).
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data; <c>e.Content</c> supplies the HTML text.</param>
        private static void bot_UriProcessingFinished(object sender, UriProcessingFinishedEventArgs e)
        {
            XmlDocument doc = new XmlDocument();
            doc.PreserveWhitespace = true;
            doc.XmlResolver = null;

            using (StringReader content = new StringReader(e.Content))
            {
                Sgml.SgmlReader sgmlReader = new Sgml.SgmlReader();
                sgmlReader.DocType = "HTML";
                sgmlReader.WhitespaceHandling = WhitespaceHandling.All;
                sgmlReader.CaseFolding = Sgml.CaseFolding.ToLower;
                sgmlReader.InputStream = content;
                doc.Load(sgmlReader);
            }

            // A document without a root element has no text; treat it as empty instead of
            // failing with a NullReferenceException.
            string textOnly = doc.DocumentElement != null ? doc.DocumentElement.InnerText : String.Empty;

            foreach (string keyword in Keywords)
            {
                // NOTE(review): the keyword is inserted into the pattern unescaped, so regex
                // metacharacters in a keyword are interpreted as regex syntax — confirm intended.
                MatchCollection matches = Regex.Matches(textOnly, "(?'found'" + keyword.Replace(" ", "[\\s]*") + ")", RegexOptions.IgnoreCase);
                Console.ForegroundColor = ConsoleColor.Yellow;
                Console.Write("Found ");
                Console.ForegroundColor = ConsoleColor.Cyan;
                Console.Write(keyword);
                Console.ForegroundColor = ConsoleColor.Yellow;
                Console.Write(" in ");
                Console.ForegroundColor = ConsoleColor.Cyan;
                Console.Write(matches.Count);
                Console.ForegroundColor = ConsoleColor.Yellow;
                Console.WriteLine(" different places.");
            }

            // Do not leave the console in the report colour for subsequent output.
            Console.ResetColor();
        }
Exemplo n.º 8
0
 /* ----------------------------------------------------------------- */
 ///
 /// ToXDocument
 ///
 /// <summary>
 /// Creates an XDocument object from the given stream, which is read as
 /// UTF-8 text through an SgmlReader configured for HTML.
 /// </summary>
 ///
 /* ----------------------------------------------------------------- */
 private static XDocument ToXDocument(System.IO.Stream src)
 {
     using (var text = new System.IO.StreamReader(src, System.Text.Encoding.UTF8))
     using (var sgml = new Sgml.SgmlReader())
     {
         sgml.CaseFolding = Sgml.CaseFolding.ToLower;
         sgml.DocType     = "HTML";
         sgml.IgnoreDtd   = true;
         sgml.InputStream = text;

         return(XDocument.Load(sgml));
     }
 }
        /// <summary>
        /// Fetches text through <c>WebText</c>, wraps it in an html/body skeleton, parses it
        /// with an SgmlReader that uses only the HTML entities DTD, and returns the result
        /// of <c>CleanseHtmlDocument</c> on the parsed document (or on <c>XDoc.Empty</c> when
        /// parsing fails or yields no root element). Returns <c>null</c> when
        /// <c>WebText</c> returns <c>null</c>.
        /// </summary>
        public static XDoc WebHtml(
            [DekiScriptParam("HTML source text or source uri (default: none)", true)] string source,
            [DekiScriptParam("xpath to value (default: none)", true)] string xpath,
            [DekiScriptParam("namespaces (default: none)", true)] Hashtable namespaces,
            [DekiScriptParam("caching duration in seconds (range: 60 - 86400; default: 300)", true)] double?ttl,
            [DekiScriptParam("return nil if source could not be loaded (default: text with error message)", true)] bool?nilIfMissing
            )
        {
            string fetched = WebText(source, xpath, namespaces, true, ttl, nilIfMissing);

            if (fetched == null)
            {
                return(null);
            }

            // convert text to html without a converter
            XDoc parsedResult = XDoc.Empty;

            using (TextReader wrapped = new StringReader("<html><body>" + fetched + "</body></html>")) {
                // NOTE (steveb): we create the sgml reader explicitly since we don't want a DTD to be associated with it; the DTD would force a potentially unwanted HTML structure

                // load the HTML entities DTD on first use and keep it for later calls
                if (_htmlEntitiesDtd == null)
                {
                    using (StreamReader dtdSource = new StreamReader(Plug.New("resource://mindtouch.deki.script/MindTouch.Deki.Script.Resources.HtmlEntities.dtd").Get().AsStream())) {
                        _htmlEntitiesDtd = Sgml.SgmlDtd.Parse(null, "HTML", dtdSource, null, null, XDoc.XmlNameTable);
                    }
                }

                Sgml.SgmlReader htmlReader = new Sgml.SgmlReader(XDoc.XmlNameTable)
                {
                    Dtd                = _htmlEntitiesDtd,
                    DocType            = "HTML",
                    WhitespaceHandling = WhitespaceHandling.All,
                    CaseFolding        = Sgml.CaseFolding.ToLower,
                    InputStream        = wrapped
                };
                try {
                    XmlDocument parsed = new XmlDocument(XDoc.XmlNameTable);
                    parsed.PreserveWhitespace = true;
                    parsed.XmlResolver        = null;
                    parsed.Load(htmlReader);

                    // only accept the document when it actually has a root element
                    if (parsed.DocumentElement != null)
                    {
                        parsedResult = new XDoc(parsed);
                    }
                } catch {
                    // swallow parsing exceptions
                }
            }
            return(CleanseHtmlDocument(parsedResult));
        }
Exemplo n.º 10
0
        /// <summary>
        /// Loads the HTML supplied by <paramref name="reader"/> into an
        /// <see cref="XDocument"/> via SgmlReader.
        /// </summary>
        public static XDocument FromHtml(TextReader reader)
        {
            // configure the SGML reader for HTML with lower-cased names
            Sgml.SgmlReader htmlReader = new Sgml.SgmlReader
            {
                DocType     = "HTML",
                CaseFolding = Sgml.CaseFolding.ToLower,
                InputStream = reader
            };

            return XDocument.Load(htmlReader);
        }
Exemplo n.º 11
0
        /// <summary>
        /// Builds an <see cref="XDocument"/> from HTML supplied by a <see cref="TextReader"/>.
        /// </summary>
        /// <param name="reader">The reader that provides the HTML text.</param>
        /// <returns>The HTML content represented as an XML document.</returns>
        private XDocument FromHtml(TextReader reader)
        {
            var htmlReader = new Sgml.SgmlReader
            {
                DocType            = "HTML",
                WhitespaceHandling = WhitespaceHandling.All,
                CaseFolding        = Sgml.CaseFolding.ToLower,
                InputStream        = reader
            };

            return(XDocument.Load(htmlReader));
        }
Exemplo n.º 12
0
        /// <summary>
        /// Loads the markup supplied by <paramref name="reader"/> into an
        /// <see cref="XmlDocument"/> via SgmlReader.
        /// </summary>
        static XmlDocument FromHtml(TextReader reader)
        {
            // NOTE(review): DocType is "XML" although the method is named FromHtml —
            // confirm this is intentional.
            Sgml.SgmlReader markupReader = new Sgml.SgmlReader
            {
                DocType            = "XML",
                WhitespaceHandling = WhitespaceHandling.None,
                CaseFolding        = Sgml.CaseFolding.ToLower,
                InputStream        = reader
            };

            XmlDocument result = new XmlDocument
            {
                PreserveWhitespace = true,
                XmlResolver        = null
            };
            result.Load(markupReader);
            return(result);
        }
        /// <summary>
        /// Creates a <c>ParserNode</c> for the element the reader is positioned on, copies a
        /// fixed set of attributes onto it and registers it through <c>AddNode</c>.
        /// </summary>
        private void HandleElementStart(Sgml.SgmlReader reader)
        {
            ParserNode element = new ParserNode(reader.Name, System.Xml.XmlNodeType.Element);

            // Only these attributes are carried over (the original author noted they found
            // no way to enumerate all attributes on the reader).
            string[] copiedAttributes = { "style", "title", "class", "href", "src", "colspan", "rowspan" };

            foreach (string attributeName in copiedAttributes)
            {
                element.AddAttribute(attributeName, reader.GetAttribute(attributeName));
            }

            AddNode(element);
        }
Exemplo n.º 14
0
        /// <summary>
        /// Downloads the page at <paramref name="webAddress"/> and parses its HTML into an
        /// <see cref="XDocument"/> via SgmlReader.
        /// </summary>
        /// <param name="webAddress">Address of the page to download.</param>
        /// <returns>The downloaded HTML as an XML document.</returns>
        public XDocument FromHtmlToXDoc(string webAddress)
        {
            string html;

            // WebClient is IDisposable; dispose it once the download is complete.
            using (WebClient webPage = new WebClient())
            {
                html = webPage.DownloadString(webAddress);
            }

            using (TextReader sr = new StringReader(html))
            {
                Sgml.SgmlReader sgmlReader = new Sgml.SgmlReader();
                sgmlReader.DocType            = "HTML";
                sgmlReader.WhitespaceHandling = WhitespaceHandling.All;
                sgmlReader.CaseFolding        = Sgml.CaseFolding.ToLower;
                sgmlReader.InputStream        = sr;
                return(XDocument.Load(sgmlReader));
            }
        }
Exemplo n.º 15
0
        /// <summary>
        /// Creates an SgmlReader over <paramref name="html"/>, configured for HTML.
        /// </summary>
        /// <param name="html">HTML text to read.</param>
        /// <param name="baseUrl">
        /// Base URI to set on the reader; skipped when <c>null</c> or empty.
        /// </param>
        /// <returns>The configured reader; the caller is responsible for disposing it.</returns>
        private static XmlReader getDocReader(
            string html,
            string baseUrl)
        {
            var r = new Sgml.SgmlReader();

            // A null baseUrl is treated like an empty one instead of throwing.
            if (!string.IsNullOrEmpty(baseUrl))
            {
                r.SetBaseUri(baseUrl);
            }
            r.DocType     = @"HTML";
            r.InputStream = new StringReader(html);

            return(r);
        }
Exemplo n.º 16
0
        /// <summary>
        /// Converts HTML to XML and returns the indented XML text of the root element's content.
        /// </summary>
        /// <param name="html">The HTML text to convert.</param>
        /// <returns>
        /// The indented XML for the children of the root element; an empty string when
        /// <paramref name="html"/> is null or whitespace; a fixed error message when the parsed
        /// document has no root element.
        /// </returns>
        public static string HTMLConvert(string html)
        {
            // Null is handled like blank input rather than throwing on html.Trim().
            if (html == null || html.Trim().Length == 0)
            {
                return(string.Empty);
            }

            // Strip CDATA opening ("<![CDATA[") and closing ("]]>") markers, tolerating
            // whitespace between their characters.
            html = System.Text.RegularExpressions.Regex.Replace(html, @"<!\s{0,}\[\s{0,}CDATA\s{0,}\[\s{0,}|\s{0,}\]\s{0,}\]\s{0,}>", "");

            using (System.IO.StringReader input = new System.IO.StringReader(html))
            using (Sgml.SgmlReader reader = new Sgml.SgmlReader())
            {
                reader.DocType            = "HTML";
                reader.InputStream        = input;
                reader.WhitespaceHandling = System.Xml.WhitespaceHandling.None;

                System.Xml.XmlDocument doc = new System.Xml.XmlDocument();
                doc.Load(reader);
                if (doc.DocumentElement == null)
                {
                    return("Html to XML Error this programe can not Convert");
                }

                using (System.IO.StringWriter stringWriter = new System.IO.StringWriter())
                {
                    using (System.Xml.XmlTextWriter writer = new System.Xml.XmlTextWriter(stringWriter))
                    {
                        writer.Formatting = System.Xml.Formatting.Indented;

                        // WriteContentTo emits the child nodes of the root, not the root itself.
                        doc.DocumentElement.WriteContentTo(writer);
                    }

                    // The writer is closed (flushed) at this point; ToString never returns null.
                    return(stringWriter.ToString());
                }
            }
        }
Exemplo n.º 17
0
        /// <summary>
        /// Parses the HTML supplied by <paramref name="reader"/> into an
        /// <see cref="XmlDocument"/> that preserves whitespace and has no resolver.
        /// </summary>
        static XmlDocument FromHtml(TextReader reader)
        {
            // SGML reader configured for HTML input
            Sgml.SgmlReader htmlReader = new Sgml.SgmlReader
            {
                DocType = "HTML",
                WhitespaceHandling = WhitespaceHandling.All,
                CaseFolding = Sgml.CaseFolding.ToLower,
                InputStream = reader
            };

            // target document
            XmlDocument result = new XmlDocument
            {
                PreserveWhitespace = true,
                XmlResolver = null
            };
            result.Load(htmlReader);
            return result;
        }
Exemplo n.º 18
0
        /// <summary>
        /// Parses <paramref name="html"/> into an <see cref="XmlDocument"/> via SgmlReader,
        /// keeping only significant whitespace.
        /// </summary>
        private static XmlDocument getXmlDocFromHtmlCode(string html)
        {
            XmlDocument result = new XmlDocument();

            using (TextReader source = new StringReader(html))
            {
                Sgml.SgmlReader htmlReader = new Sgml.SgmlReader
                {
                    DocType            = "HTML",
                    WhitespaceHandling = WhitespaceHandling.Significant,
                    CaseFolding        = Sgml.CaseFolding.ToLower,
                    InputStream        = source
                };
                result.Load(htmlReader);
            }

            return(result);
        }
Exemplo n.º 19
0
        /// <summary>
        /// Converts the HTML supplied by <paramref name="reader"/> into an
        /// <see cref="XmlDocument"/> via SgmlReader.
        /// </summary>
        private static XmlDocument FromHtml(TextReader reader)
        {
            // reader side: HTML, all whitespace reported, lower-cased names
            Sgml.SgmlReader htmlReader = new Sgml.SgmlReader
            {
                DocType            = "HTML",
                WhitespaceHandling = WhitespaceHandling.All,
                CaseFolding        = Sgml.CaseFolding.ToLower,
                InputStream        = reader
            };

            // document side: keep whitespace, resolve nothing externally
            XmlDocument result = new XmlDocument
            {
                PreserveWhitespace = true,
                XmlResolver        = null
            };
            result.Load(htmlReader);
            return(result);
        }
Exemplo n.º 20
0
        /// <summary>
        /// Builds an <see cref="XmlDocument"/> from an HTML string: loads it through the
        /// reader produced by <c>CreateSgmlReader</c>, then calls <c>RemoveDeclaration</c>
        /// and <c>SetDocumentType("html")</c> on the result.
        /// </summary>
        static public XmlDocument ParseHTMLtext(string html)
        {
            XmlDocument parsed = new XmlDocument();
            parsed.PreserveWhitespace = false;
            parsed.XmlResolver        = null;

            using (Sgml.SgmlReader htmlReader = CreateSgmlReader(html))
            {
                parsed.Load(htmlReader);
            }

            parsed.RemoveDeclaration();
            parsed.SetDocumentType("html");

            return(parsed);
        }
Exemplo n.º 21
0
        /// <summary>
        /// Parses HTML from <paramref name="textReader"/> into an <see cref="XmlDocument"/>.
        /// </summary>
        /// <param name="textReader">Source of the HTML text; must not be null.</param>
        /// <returns>The parsed document.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="textReader"/> is null.</exception>
        public XmlDocument ParseSgml(TextReader textReader)
        {
            if (textReader == null)
            {
                throw new ArgumentNullException ("textReader");
            }

            Sgml.SgmlReader htmlReader = new Sgml.SgmlReader
            {
                DocType = "HTML",
                WhitespaceHandling = WhitespaceHandling.All,
                CaseFolding = Sgml.CaseFolding.ToLower,
                InputStream = textReader
            };

            XmlDocument result = new XmlDocument
            {
                PreserveWhitespace = true,
                XmlResolver = null
            };
            result.Load(htmlReader);

            return result;
        }
Exemplo n.º 22
0
        /// <summary>
        /// Reads the HTML file at <paramref name="path"/> and builds a
        /// <see cref="FlowDocument"/> from it: every <c>body</c> and <c>head</c> element is
        /// converted into a <see cref="Section"/> by <c>ParseBody</c> / <c>ParseHead</c> and
        /// appended to the document.
        /// </summary>
        /// <param name="path">Path of the HTML file to import.</param>
        /// <returns>The populated flow document.</returns>
        public FlowDocument Import(string path)
        {
            // Create the flow document
            FlowDocument fd = new FlowDocument();

            fd.IsHyphenationEnabled      = true;
            fd.IsOptimalParagraphEnabled = true;
            fd.ColumnRuleWidth           = 5;
            fd.FontSize   = 12;
            fd.FontFamily = new FontFamily("Times New Roman");

            using (StreamReader reader = new StreamReader(path))
            {
                // setup the SgmlReader and load it into a XDocument
                Sgml.SgmlReader sgmlReader = new Sgml.SgmlReader();
                sgmlReader.DocType            = "HTML";
                sgmlReader.WhitespaceHandling = WhitespaceHandling.All;
                sgmlReader.CaseFolding        = Sgml.CaseFolding.ToLower;
                sgmlReader.InputStream        = reader;
                XDocument xd = XDocument.Load(sgmlReader);

                // Walk every element of the page
                foreach (XElement p in xd.Descendants())
                {
                    Section s;
                    switch (p.Name.LocalName)
                    {
                    case "body":
                        s = new Section();
                        ParseBody(s.Blocks, p);
                        break;

                    case "head":
                        s = new Section();
                        ParseHead(s.Blocks, p);
                        break;

                    default:
                        continue;
                    }

                    // Previously each section was filled and then dropped, so the returned
                    // document was always empty. Attach sections that received content.
                    if (s.Blocks.Count > 0)
                    {
                        fd.Blocks.Add(s);
                    }
                }
            }
            return(fd);
        }
Exemplo n.º 23
0
        /// <summary>
        /// Parses <paramref name="html"/> through SgmlReader and returns the concatenated
        /// text content of the resulting document.
        /// </summary>
        static string GetTextFromHtml(string html)
        {
            var parsed = new XmlDocument();

            using (var htmlReader = new Sgml.SgmlReader())
            {
                // reader settings: lower-cased names, HTML doctype, whitespace nodes dropped
                htmlReader.CaseFolding        = Sgml.CaseFolding.ToLower;
                htmlReader.DocType            = "HTML";
                htmlReader.WhitespaceHandling = WhitespaceHandling.None;

                using (var source = new StringReader(html))
                {
                    htmlReader.InputStream = source;
                    parsed.Load(htmlReader);
                }
            }

            string text = parsed.InnerText;
            return(text);
        }
        /// <summary>
        /// Parses <paramref name="markup"/> into a <c>Vehicle</c>. The data is read from the
        /// child elements with class "pairValue" of the div whose id is "pnlVehicleInfo".
        /// </summary>
        /// <returns>The vehicle, or <c>null</c> when the container div is not present.</returns>
        /// <exception cref="ApplicationException">
        /// The container does not hold exactly four value elements.
        /// </exception>
        public Vehicle Parse(string markup)
        {
            var sgml = new Sgml.SgmlReader();
            sgml.InputStream = new StringReader(markup);
            XDocument page = XDocument.Load(sgml);

            var infoPanel = page.Root
                                .Descendants("div")
                                .Where(div => div.HasId("pnlVehicleInfo"))
                                .SingleOrDefault();
            if (infoPanel == null)
            {
                return null;
            }

            var fields = (from child in infoPanel.Elements()
                          where child.HasClass("pairValue")
                          select child).ToArray();
            if (fields.Length != 4)
            {
                throw new ApplicationException("Cannot parse markup to Vehicle, wrong number of values found");
            }

            // values appear in the order: make, model, VIN, registration number
            var vehicle = new Vehicle();
            vehicle.Make = fields[0].Value;
            vehicle.Model = fields[1].Value;
            vehicle.Vin = fields[2].Value;
            vehicle.RegistrationNumber = fields[3].Value;
            vehicle.Inspections = ParseInspections(page);
            return vehicle;
        }
Exemplo n.º 25
0
        /// <summary>
        /// Strips every <c>xmlns="..."</c> attribute from <paramref name="html"/> and parses
        /// the result into an <see cref="XmlDocument"/> via SgmlReader.
        /// </summary>
        private static XmlDocument getXmlDocFromHtmlCode(string html)
        {
            // Default-namespace declarations get in the way of simple XPath queries, so drop them.
            string withoutNamespaces = Regex.Replace(html, "xmlns=\"[^\"]*\"", "");

            XmlDocument result = new XmlDocument();

            using (TextReader source = new StringReader(withoutNamespaces))
            {
                Sgml.SgmlReader htmlReader = new Sgml.SgmlReader
                {
                    DocType            = "HTML",
                    WhitespaceHandling = WhitespaceHandling.Significant,
                    CaseFolding        = Sgml.CaseFolding.ToLower,
                    InputStream        = source
                };
                result.Load(htmlReader);
            }

            return(result);
        }
        /// <summary>
        /// Reads <paramref name="reader"/> to the end and dispatches element starts, element
        /// ends and text nodes to their handlers; all other node types are ignored.
        /// </summary>
        private void LoadReader(Sgml.SgmlReader reader)
        {
            while (reader.Read())
            {
                System.Xml.XmlNodeType current = reader.NodeType;

                if (current == System.Xml.XmlNodeType.Element)
                {
                    HandleElementStart(reader);
                }
                else if (current == System.Xml.XmlNodeType.EndElement)
                {
                    HandleElementEnd(reader);
                }
                else if (current == System.Xml.XmlNodeType.Text)
                {
                    HandleText(reader);
                }
            }
        }
Exemplo n.º 27
0
        /// <summary>
        /// Pre-processes <paramref name="sgml"/> with the <c>XmlHtmlEntity</c> helpers, removes
        /// its DOCTYPE and returns an SgmlReader (HTML, all whitespace, lower-cased names)
        /// positioned at the start of the resulting text.
        /// </summary>
        static private Sgml.SgmlReader CreateSgmlReader(string sgml)
        {
            // apply each base entity conversion in turn
            foreach (var entity in XmlHtmlEntity.HtmlBase)
            {
                sgml = entity.ParseXMLtoHTML(sgml);
            }

            var remainingEntities = XmlHtmlEntity.Html2.Concat(XmlHtmlEntity.Html3, XmlHtmlEntity.Html4);
            sgml = XmlHtmlEntity.ParseToCHAR(sgml, remainingEntities);

            StreamReader source = new StreamReader(new StreamString(sgml.RemoveDOCTYPE()));

            Sgml.SgmlReader result = new Sgml.SgmlReader();
            result.DocType            = "HTML";
            result.WhitespaceHandling = WhitespaceHandling.All;
            result.CaseFolding        = Sgml.CaseFolding.ToLower;
            result.InputStream        = source;

            // make sure reading starts at the beginning of the underlying stream
            source.BaseStream.Position = 0;
            return(result);
        }
Exemplo n.º 28
0
        /// <summary>
        /// Reads the HTML file at <paramref name="path"/> and converts it into an
        /// <see cref="XmlDocument"/> via SgmlReader.
        /// </summary>
        /// <param name="path">Path of the HTML file to read.</param>
        /// <returns>The parsed document.</returns>
        public XmlDocument ConvertHtmlToXml(string path)
        {
            // create document
            XmlDocument doc = new XmlDocument();

            doc.PreserveWhitespace = true;
            doc.XmlResolver        = null;

            // The file reader must be disposed, otherwise the file handle stays open
            // until the garbage collector finalizes it.
            using (StreamReader reader = new StreamReader(path))
            {
                // setup SgmlReader
                Sgml.SgmlReader sgmlReader = new Sgml.SgmlReader();
                sgmlReader.DocType            = "HTML";
                sgmlReader.WhitespaceHandling = WhitespaceHandling.All;
                sgmlReader.CaseFolding        = Sgml.CaseFolding.ToLower;
                sgmlReader.InputStream        = reader;

                doc.Load(sgmlReader);
            }

            return(doc);
        }
Exemplo n.º 29
0
        /// <summary>
        /// Builds an <see cref="XmlDocument"/> from <paramref name="content"/>. When the text
        /// contains "&lt;rss", everything from that point on is loaded as XML; otherwise the
        /// text is treated as HTML and converted through SgmlReader.
        /// </summary>
        /// <param name="content">The document text; must not be null.</param>
        /// <returns>The loaded document.</returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="content"/> is null.</exception>
        public static XmlDocument DownloadDocument(string content)
        {
            if (content == null)
            {
                throw new System.ArgumentNullException("content");
            }

            // The former try/catch blocks only rethrew, so exceptions propagate unchanged
            // without them.
            var doc = new XmlDocument {
                PreserveWhitespace = true, XmlResolver = null
            };
            var i = content.IndexOf("<rss", System.StringComparison.Ordinal);
            if (i == -1)
            {
                using (var input = new System.IO.StringReader(content))
                using (var xhtmlConverter = new Sgml.SgmlReader())
                {
                    xhtmlConverter.DocType            = "HTML";
                    xhtmlConverter.WhitespaceHandling = WhitespaceHandling.All;
                    xhtmlConverter.CaseFolding        = Sgml.CaseFolding.ToLower;
                    xhtmlConverter.InputStream        = input;
                    doc.Load(xhtmlConverter);
                }
            }
            else
            {
                // skip anything that precedes the rss element
                doc.LoadXml(content.Substring(i));
            }

            return(doc);
        }
Exemplo n.º 30
0
        /// <summary>
        /// Parses the data from the server response page.
        /// </summary>
        /// <param name="reader">Reader that supplies the HTML of the response.</param>
        /// <returns>The response converted into an <see cref="XmlDocument"/>.</returns>
        private XmlDocument FromHtml(TextReader reader)
        {
            // configure the SGML reader
            var htmlReader = new Sgml.SgmlReader();
            htmlReader.DocType            = "HTML";
            htmlReader.WhitespaceHandling = WhitespaceHandling.All;
            htmlReader.CaseFolding        = Sgml.CaseFolding.ToLower;
            htmlReader.InputStream        = reader;

            // configure and fill the target document
            var result = new XmlDocument();
            result.PreserveWhitespace = true;
            result.XmlResolver        = null;
            result.Load(htmlReader);

            return(result);
        }
Exemplo n.º 31
0
        /// <summary>
        /// Fetches the HTML for <paramref name="Url"/> through <c>FetchHtmlDoc</c>, parses it
        /// and returns the text of the first <c>p</c> element.
        /// </summary>
        /// <param name="Url">Address of the product page.</param>
        /// <returns>
        /// The text of the first paragraph, or an empty string when the page has no <c>p</c> element.
        /// </returns>
        private String GetProductDescription(String Url)
        {
            XmlDocument doc = new XmlDocument();

            doc.PreserveWhitespace = true;
            doc.XmlResolver        = null;

            // The reader returned by FetchHtmlDoc is disposed once the document is loaded.
            using (var input = FetchHtmlDoc(Url))
            {
                Sgml.SgmlReader sgmlReader = new Sgml.SgmlReader();
                sgmlReader.DocType            = "HTML";
                sgmlReader.WhitespaceHandling = WhitespaceHandling.All;
                sgmlReader.CaseFolding        = Sgml.CaseFolding.ToLower;
                sgmlReader.InputStream        = input;

                doc.Load(sgmlReader);
            }

            XmlNodeList Pnodes = doc.GetElementsByTagName("p");

            // The indexer yields null for an empty list; avoid dereferencing it.
            if (Pnodes.Count == 0)
            {
                return(String.Empty);
            }

            return(Pnodes[0].InnerText);
        }
Exemplo n.º 32
0
        /// <summary>
        /// Parses HTML from <paramref name="textReader"/> into an <see cref="XmlDocument"/>
        /// that preserves whitespace and has no resolver.
        /// </summary>
        /// <param name="textReader">Source of the HTML text; must not be null.</param>
        /// <returns>The parsed document.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="textReader"/> is null.</exception>
        public XmlDocument ParseSgml(TextReader textReader)
        {
            if (textReader == null)
            {
                throw new ArgumentNullException("textReader");
            }

            Sgml.SgmlReader htmlReader = new Sgml.SgmlReader
            {
                DocType            = "HTML",
                WhitespaceHandling = WhitespaceHandling.All,
                CaseFolding        = Sgml.CaseFolding.ToLower,
                InputStream        = textReader
            };

            XmlDocument result = new XmlDocument
            {
                PreserveWhitespace = true,
                XmlResolver        = null
            };
            result.Load(htmlReader);

            return(result);
        }
Exemplo n.º 33
0
        /// <summary>
        /// Event handler: parses the downloaded results page, walks the <c>cite</c> nodes
        /// selected by a fixed XPath and, for every cited URL whose host contains
        /// <c>LookingForDomain</c>, records and prints its rank.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data; <c>e.Content</c> supplies the HTML text.</param>
        private static void bot_UriProcessingFinished(object sender, UriProcessingFinishedEventArgs e)
        {
            Sgml.SgmlReader htmlReader = new Sgml.SgmlReader
            {
                DocType = "HTML",
                WhitespaceHandling = WhitespaceHandling.All,
                CaseFolding = Sgml.CaseFolding.ToLower,
                InputStream = new StringReader(e.Content)
            };

            XmlDocument page = new XmlDocument
            {
                PreserveWhitespace = true,
                XmlResolver = null
            };
            page.Load(htmlReader);

            XmlNodeList citations = page.SelectNodes(@"/html/body[@id='gsr']/div[@id='res']/div/ol/li/div/cite");
            int position = 0;

            foreach (XmlNode citation in citations)
            {
                // every cite node counts towards the rank, matching or not
                position++;

                string candidate = "http://" + GoogleUrlResultSize.Replace(citation.InnerText, String.Empty);
                Uri url;

                if (!Uri.TryCreate(candidate, UriKind.Absolute, out url))
                {
                    continue;
                }
                if (url.Host.IndexOf(LookingForDomain) < 0)
                {
                    continue;
                }

                string result = String.Format("Rank {0} for {1}", position + CurrentlyProcessingStart, url);
                ResultsFound.Add(result);

                Console.ForegroundColor = ConsoleColor.Cyan;
                Console.WriteLine(result);
                Console.ResetColor();
                Console.Beep();
            }
        }
Exemplo n.º 34
0
        /// <summary>
        /// Copies <paramref name="stream"/> into memory, detects its encoding, parses it
        /// through SgmlReader and updates the annotation and cover of
        /// <paramref name="book"/> from the parsed document.
        /// </summary>
        /// <param name="book">The book to update.</param>
        /// <param name="stream">Stream holding the book file.</param>
        /// <returns>
        /// Always <c>true</c>; failures of the annotation or cover update are logged and
        /// do not abort the read.
        /// </returns>
        public bool Read(Book book, Stream stream)
        {
            var mem = new MemoryStream();

            stream.CopyTo(mem);
            var encoding = Util.DetectXmlEncoding(mem);

            logger.Trace($"Book encoding detected, id:{book.Id}, enc:{encoding}");

            // CopyTo leaves the position at the end of the buffer, and where
            // DetectXmlEncoding leaves it is not visible here; rewind so parsing always
            // starts at the first byte.
            mem.Position = 0;

            using (var reader = new StreamReader(mem, encoding))
            {
                using (var sgmlReader = new Sgml.SgmlReader())
                {
                    sgmlReader.InputStream = reader;
                    var doc = XDocument.Load(sgmlReader);
                    logger.Trace($"Book file loaded, id:{book.Id}");

                    // Both updates are best-effort: a failure in one must not prevent the
                    // other, but it is logged rather than silently discarded.
                    try
                    {
                        UpdateAnnotation(book, doc);
                    }
                    catch (Exception ex)
                    {
                        logger.Trace($"Book annotation update failed, id:{book.Id}, error:{ex.Message}");
                    }

                    try
                    {
                        UpdateCover(book, doc);
                    }
                    catch (Exception ex)
                    {
                        logger.Trace($"Book cover update failed, id:{book.Id}, error:{ex.Message}");
                    }
                }
            }

            return(true);
        }
Exemplo n.º 35
0
        /// <summary>
        /// Obtains text through <c>WebText</c>, wraps it in html/body tags, parses the result
        /// with an SGML reader and returns the document after <c>CleanseHtmlDocument</c>.
        /// </summary>
        /// <returns>
        /// The cleansed form of the parsed document, or of <c>XDoc.Empty</c> when parsing fails
        /// or produces no document element.
        /// </returns>
        public XDoc WebHtml(
            [DekiExtParam("HTML source text or source uri (default: none)", true)] string source,
            [DekiExtParam("xpath to value (default: none)", true)] string xpath,
            [DekiExtParam("namespaces (default: none)", true)] Hashtable namespaces,
            [DekiExtParam("caching duration in seconds (range: 300+; default: 300)", true)] double? ttl
        ) {
            string fetched = WebText(source, xpath, namespaces, true, ttl);

            // wrap the fetched text in a minimal HTML envelope and parse it
            XDoc parsed = XDoc.Empty;
            using(TextReader htmlReader = new StringReader("<html><body>" + fetched + "</body></html>")) {

                // load the HTML entities DTD on first use and keep it for later calls
                if(_htmlEntitiesDtd == null) {
                    using(StreamReader entitiesReader = new StreamReader(Plug.New("resource://mindtouch.deki.script/MindTouch.Deki.Script.HtmlEntities.dtd").Get().AsStream())) {
                        _htmlEntitiesDtd = Sgml.SgmlDtd.Parse(null, "HTML", entitiesReader, null, null, XDoc.XmlNameTable);
                    }
                }

                // the reader is built by hand so that only the entities DTD is attached;
                // a full HTML DTD would force a potentially unwanted HTML structure
                Sgml.SgmlReader htmlParser = new Sgml.SgmlReader(XDoc.XmlNameTable) {
                    Dtd = _htmlEntitiesDtd,
                    DocType = "HTML",
                    WhitespaceHandling = WhitespaceHandling.All,
                    CaseFolding = Sgml.CaseFolding.ToLower,
                    InputStream = htmlReader
                };
                try {
                    XmlDocument dom = new XmlDocument(XDoc.XmlNameTable) {
                        PreserveWhitespace = true,
                        XmlResolver = null
                    };
                    dom.Load(htmlParser);

                    // only keep the outcome when a document element was produced
                    if(dom.DocumentElement != null) {
                        parsed = new XDoc(dom);
                    }
                } catch(Exception) {

                    // parsing failures are ignored on purpose; the empty document is used instead
                }
            }
            return DekiScriptLibrary.CleanseHtmlDocument(parsed);
        }
Exemplo n.º 36
0
 /// <summary>
 /// Parses HTML from <paramref name="reader"/> into an <c>XDoc</c>.
 /// </summary>
 /// <param name="reader">Reader that supplies the HTML text.</param>
 /// <returns>
 /// The parsed document, or <c>XDoc.Empty</c> when loading throws or yields no document element.
 /// </returns>
 private static XDoc FromHtml(TextReader reader)
 {
     Sgml.SgmlReader htmlParser = new Sgml.SgmlReader(XDoc.XmlNameTable);
     htmlParser.Dtd = _dtd;
     htmlParser.DocType = "HTML";
     htmlParser.WhitespaceHandling = WhitespaceHandling.All;
     htmlParser.CaseFolding = Sgml.CaseFolding.ToLower;
     htmlParser.InputStream = reader;

     XDoc converted = XDoc.Empty;
     try {
         XmlDocument dom = XDoc.NewXmlDocument();
         dom.Load(htmlParser);
         if(dom.DocumentElement != null) {

             // remember the DTD the reader ended up with so later calls can reuse it
             if(_dtd == null) {
                 _dtd = htmlParser.Dtd;
             }
             converted = new XDoc(dom);
         }
     } catch(Exception) {

         // any failure results in the empty document
         converted = XDoc.Empty;
     }
     return converted;
 }
        /// <summary>
        /// Parses the HTML supplied by a <see cref="TextReader"/> into an <see cref="XDocument"/>.
        /// </summary>
        /// <param name="reader">The reader that supplies the HTML markup.</param>
        /// <returns>The HTML content loaded as an XML document.</returns>
        private XDocument FromHtml(TextReader reader)
        {
            // Treat the input as HTML, keep all whitespace and fold names to lower case.
            var htmlParser = new Sgml.SgmlReader
            {
                DocType = "HTML",
                WhitespaceHandling = WhitespaceHandling.All,
                CaseFolding = Sgml.CaseFolding.ToLower,
                InputStream = reader
            };

            return XDocument.Load(htmlParser);
        }
Exemplo n.º 38
0
        /// <summary>
        /// Builds an <see cref="XmlDocument"/> from an <c>HtmlDocument</c> by re-parsing the outer
        /// HTML of the body's parent element with an SGML reader.
        /// </summary>
        /// <param name="htmlDoc">Document to convert; may be null.</param>
        /// <returns>
        /// The parsed document, or an empty <see cref="XmlDocument"/> when <paramref name="htmlDoc"/>,
        /// its body, or the body's parent is null.
        /// </returns>
        private static XmlDocument DOMTreeToXml(HtmlDocument htmlDoc)
        {
            XmlDocument xml = new XmlDocument();

            bool hasTopElement = htmlDoc != null &&
                                 htmlDoc.Body != null &&
                                 htmlDoc.Body.Parent != null;
            if (!hasTopElement)
            {
                return xml;
            }

            HtmlElement topElement = htmlDoc.Body.Parent;
            using (StringReader markupReader = new StringReader(topElement.OuterHtml))
            using (StringWriter parseErrors = new StringWriter())
            {
                Sgml.SgmlReader htmlParser = new Sgml.SgmlReader();
                htmlParser.ErrorLog = parseErrors;
                htmlParser.InputStream = markupReader;

                // The DTD comes from the text stored in Properties.Resources.WeakHtml.
                using (StringReader dtdSource = new StringReader(Properties.Resources.WeakHtml))
                {
                    htmlParser.Dtd = Sgml.SgmlDtd.Parse(null, "HTML", null, dtdSource, null, null, htmlParser.NameTable);
                }

                xml.Load(htmlParser);

                // Echo whatever the parser reported to the console.
                parseErrors.Flush();
                Console.WriteLine(parseErrors.ToString());
            }

            return xml;
        }
Exemplo n.º 39
0
        /// <summary>
        /// Loads the page at <paramref name="entryUrl"/> and yields one <c>Comment</c> for each
        /// element under the 'content' element whose class attribute contains 'entry-comment'.
        /// </summary>
        /// <param name="entryUrl">URL of the entry page to load.</param>
        /// <returns>
        /// The comments found; an empty sequence when the page has no element with id 'content'.
        /// </returns>
        /// <remarks>
        /// Comments are loaded dynamically by JavaScript, so they cannot be retrieved with this approach.
        /// This method is an iterator: the page is not fetched until the sequence is enumerated.
        /// </remarks>
        private static IEnumerable<Comment> RetrieveComments(string entryUrl)
        {
            var doc = new XmlDocument();

            Console.Error.Write("{0} のコメントを取得中 ... ", entryUrl);

            using (var sgmlReader = new Sgml.SgmlReader()) {
                sgmlReader.Href        = entryUrl;
                sgmlReader.CaseFolding = Sgml.CaseFolding.ToLower;

                doc.Load(sgmlReader);

                // NOTE(review): presumably a pause to throttle successive page fetches - confirm.
                System.Threading.Thread.Sleep(500);
            }

            // The content container is located with XPath on its id attribute.
            var contentNode = doc.SelectSingleNode("//*[@id = 'content']");

            // A page without a 'content' element has no comments to report; skip the scan
            // instead of dereferencing null.
            if (contentNode != null)
            {
                foreach (XmlElement commentRootElement in contentNode.SelectNodes(".//*[contains(@class, 'entry-comment')]"))
                {
                    var comment = new Comment();

                    foreach (XmlNode commentChildNode in commentRootElement.ChildNodes)
                    {
                        // Only element children carry comment data; skip text, comments, etc.
                        if (commentChildNode.NodeType != XmlNodeType.Element)
                        {
                            continue;
                        }

                        var commentChildElement = (XmlElement)commentChildNode;

                        switch (commentChildElement.GetAttribute("class"))
                        {
                        case "comment-user-name":
                            /*
                             * <!-- hatena user -->
                             * <e class="comment-user-name">
                             *   <a class="comment-user-id" href="http://blog.hatena.ne.jp/hatenaid/">
                             *     <span class="comment-nickname" data-user-name="hatenaid">
                             *       id:hatenaid
                             *     </span>
                             *   </a>
                             * </e>
                             * <!-- name with website -->
                             * <e class="comment-user-name">
                             *   name
                             *   <a class="icon-website" href="http://example.com/" />
                             * </e>
                             * <!-- name only -->
                             * <e class="comment-user-name">
                             *   name
                             * </e>
                             */
                            comment.Author = commentChildElement.InnerText.Trim();
                            comment.Url    = commentChildElement.GetSingleNodeValueOf(".//@href");
                            break;

                        case "comment-content":
                            /*
                             * <e class="comment-content">
                             *   <p>comment-html</p>
                             * </e>
                             */
                            // An empty content element has no first child; leave Content unset then.
                            var contentBody = commentChildElement.FirstChild;
                            if (contentBody != null)
                            {
                                comment.Content = contentBody.InnerXml;
                            }
                            break;

                        case "comment-metadata":
                            /*
                             * <e class="comment-metadata">
                             *   <time data-epoch="1387283661000" />
                             * </e>
                             */
                            // data-epoch is divided by 1000 before conversion (milliseconds in the sample above).
                            comment.Date = DateTimeOffset.FromUnixTime(commentChildElement.GetSingleNodeValueOf("time/@data-epoch", long.Parse) / 1000).ToLocalTime();
                            break;
                        }
                    }

                    yield return(comment);
                }
            }

            Console.Error.WriteLine("完了");
        }
Exemplo n.º 40
0
        /// <summary>
        /// Searches the tweakers.net price watch for <paramref name="manufacturerId"/>, follows the
        /// first product link in the result table and collects one <c>ShopResult</c> per shop row
        /// of the product page's price table.
        /// </summary>
        /// <param name="manufacturerId">Identifier used as the search keyword.</param>
        /// <returns>
        /// The result for the id. It is returned as constructed when the search page reports that no
        /// products were found, and with <c>ProductStatus.Obsolete</c> when the product page
        /// reports that no prices are available.
        /// </returns>
        public SlurpResult Process(string manufacturerId)
        {
            using (WebClient client = new WebClient())
            {
                SlurpResult result = new SlurpResult(manufacturerId);
                result.SiteName = this.GetType().Name;

                // Escape the id so characters such as '&', '#' or '+' cannot alter the query string.
                var tweakerUrl = String.Format("http://tweakers.net/pricewatch/zoeken/?keyword={0}", Uri.EscapeDataString(manufacturerId ?? String.Empty));

                string searchResults = client.DownloadString(tweakerUrl);

                // Dutch: "No products were found."
                if (searchResults.Contains("Er werden geen producten gevonden."))
                {
                    return(result);
                }

                log.InfoFormat("Processing product {0}", manufacturerId);

                HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
                doc.LoadHtml(searchResults);

                // The link in the second cell of the first result row leads to the product page.
                var productUrl = doc.DocumentNode.SelectSingleNode("//table[@class=\"priceTable\"]/tbody/tr[1]/td[2]/p/a").Attributes["href"].Value;

                string productPage = client.DownloadString(productUrl);

                // Dutch: "Prices are no longer shown for this product." / "No current prices known."
                if (productPage.Contains("Van dit product worden geen prijzen meer getoond.") || productPage.Contains("Geen actuele prijzen bekend."))
                {
                    result.ProductStatus = ProductStatus.Obsolete;
                    return(result);
                }

                // XDocument.Load reads the whole page, so the readers can be released before the rows are processed.
                XDocument xdoc;
                using (var reader = new StringReader(productPage))
                using (Sgml.SgmlReader sgmlReader = new Sgml.SgmlReader())
                {
                    sgmlReader.DocType            = "HTML";
                    sgmlReader.WhitespaceHandling = WhitespaceHandling.All;
                    sgmlReader.CaseFolding        = Sgml.CaseFolding.ToLower;
                    sgmlReader.InputStream        = reader;

                    xdoc = XDocument.Load(sgmlReader);
                }

                // Converting a missing attribute to string yields null, so tables without a
                // class attribute are skipped instead of throwing.
                var rows = xdoc.Root.Descendants("table").Where(t => (string)t.Attribute("class") == "priceTable").First().Element("tbody").Elements("tr");

                foreach (var row in rows)
                {
                    // Materialize once; the cells are searched several times below.
                    var cells = row.Elements("td").ToList();
                    if (cells.Count == 0)
                    {
                        // No data cells means there is no shop to read from this row.
                        continue;
                    }

                    // Rows whose first cell spans several columns are not shop rows.
                    var firstCell = cells[0];
                    var colspan   = firstCell.Attribute("colspan");
                    if (colspan != null && Int32.Parse(colspan.Value) > 1)
                    {
                        continue;
                    }

                    var shop = firstCell.Value;

                    var      priceCell = cells.FirstOrDefault(x => (string)x.Attribute("class") == "price");
                    decimal? price     = null;
                    if (priceCell != null)
                    {
                        price = StripPrice2(priceCell.Value);
                    }

                    // The total price is the text of the link inside the cell's div; any missing
                    // piece falls back to null, as the absent-div case always did.
                    var    totalPriceCell = cells.FirstOrDefault(x => (string)x.Attribute("class") == "totalPrice");
                    var    totalDiv       = totalPriceCell != null ? totalPriceCell.Element("div") : null;
                    var    totalLink      = totalDiv != null ? totalDiv.Element("a") : null;
                    string txt            = totalLink != null ? totalLink.Value : null;

                    var total = StripPrice2(txt);

                    // The delivery status is derived from the icon's src; an absent image or
                    // src attribute is treated as an empty icon name.
                    var    deliveryCell = cells.First(x => (string)x.Attribute("class") == "delivery");
                    var    img          = deliveryCell.Element("img");
                    string delivery     = (img != null ? (string)img.Attribute("src") : null) ?? String.Empty;

                    var deliveryStatus = ParseIcon(delivery);

                    result.Shops.Add(new ShopResult()
                    {
                        Name       = shop,
                        Price      = price,
                        TotalPrice = total,
                        Delivery   = deliveryStatus
                    });
                }

                return(result);
            }
        }