// Parse the document from the current position until we find the
        // matching closing tag
        private SnapshotSpan?FindClosingTag(ITextSnapshot snapshot, int searchStart, string searchFor)
        {
            // Only the text from searchStart onwards is handed to the parser, so the
            // line numbers and columns it reports are relative to that starting point.
            String remainder = snapshot.GetText(searchStart, snapshot.Length - searchStart);

            using (SgmlReader sgml = new SgmlReader()) {
                sgml.InputStream        = new StringReader(remainder);
                sgml.WhitespaceHandling = WhitespaceHandling.All;
                try {
                    sgml.Read();
                    if (sgml.IsEmptyElement)
                    {
                        // the first node has no separate closing tag to look for
                        return(null);
                    }

                    // consume nodes until the depth-1 end element shows up (or the input runs out)
                    while (sgml.Read() && !(sgml.NodeType == XmlNodeType.EndElement && sgml.Depth == 1))
                    {
                    }

                    // Map the reader's line/column back onto a snapshot position.
                    // Per the original author: the reader may be sitting on whitespace
                    // that follows the closing tag, hence the column corrections below.
                    var firstLine     = snapshot.GetLineFromPosition(searchStart);
                    int columnOfStart = searchStart - firstLine.Start.Position;
                    int tagStart;
                    if (sgml.LineNumber == 1)
                    {
                        // the reader is still on the line the search started on
                        tagStart = firstLine.Start.Position + columnOfStart + sgml.LinePosition - 2;
                    }
                    else
                    {
                        var closingLine = snapshot.GetLineFromLineNumber(firstLine.LineNumber + sgml.LineNumber - 1);
                        tagStart = closingLine.Start.Position + sgml.LinePosition - 1;
                    }
                    tagStart -= sgml.Name.Length + 3; // </ + element + >

                    // accept the computed span only if it really contains the text we are after
                    SnapshotSpan candidate = new SnapshotSpan(snapshot, tagStart, searchFor.Length);
                    string       found     = candidate.GetText();
                    if (found == searchFor)
                    {
                        return(candidate);
                    }
                    Trace.WriteLine(String.Format("Searching for '{0}', but found '{1}'.", searchFor, found));
                    return(null);
                } catch (Exception ex) {
                    Trace.WriteLine(String.Format("Exception while parsing document: {0}.", ex.ToString()));
                }
            }
            return(null);
        }
Beispiel #2
0
        // Creates an XmlDocument from HTML content. The content is wrapped in a
        // "<root>" element before parsing, so the returned document always has
        // "root" as its document element. Element names are folded to lower case
        // and whitespace is preserved.
        public static XmlDocument ParseHtml(string sContent)
        {
            string xml;

            // every reader/writer is disposed even when parsing or writing throws
            using (StringReader sr = new StringReader("<root>" + sContent + "</root>"))
            using (SgmlReader reader = new SgmlReader())
            using (StringWriter sw = new StringWriter())
            {
                reader.WhitespaceHandling = WhitespaceHandling.All;
                reader.CaseFolding        = Sgml.CaseFolding.ToLower;
                reader.InputStream        = sr;

                using (XmlTextWriter w = new XmlTextWriter(sw))
                {
                    w.Formatting = Formatting.Indented;
                    w.WriteStartDocument();

                    // WriteNode advances the reader itself, so loop on EOF
                    // rather than calling Read() again between nodes
                    reader.Read();
                    while (!reader.EOF)
                    {
                        w.WriteNode(reader, true);
                    }
                    w.Flush();
                }

                // the writer is closed at this point; the text stays available in sw
                xml = sw.ToString();
            }

            // create document
            XmlDocument doc = new XmlDocument();

            doc.PreserveWhitespace = true;
            doc.XmlResolver        = null; // no external resources are resolved while loading
            doc.LoadXml(xml);

            return(doc);
        }
Beispiel #3
0
        // Copies everything the reader produces to an XML writer.
        // Input:  standard input when uri is null, otherwise the reader is pointed at uri.
        // Output: the configured 'output' target (using this.encoding) when one is set,
        //         otherwise standard output.
        void Process(SgmlReader reader, string uri)
        {
            if (uri == null)
            {
                reader.InputStream = Console.In;
            }
            else
            {
                reader.Href = uri;
            }

            // fall back to the reader's encoding only when none was chosen beforehand
            this.encoding ??= reader.GetEncoding();

            // 'using' guarantees the writer is closed even if copying throws part-way
            using (XmlTextWriter w = output != null
                ? new XmlTextWriter(output, this.encoding)
                : new XmlTextWriter(Console.Out))
            {
                if (formatted)
                {
                    w.Formatting = Formatting.Indented;
                }
                if (!noxmldecl)
                {
                    w.WriteStartDocument();
                }

                // WriteNode advances the reader, so loop on EOF instead of Read()
                reader.Read();
                while (!reader.EOF)
                {
                    w.WriteNode(reader, true);
                }
                w.Flush();
            }
        }
        /// <summary>
        /// Converts HTML markup to indented, well-formed XML and optionally extracts the
        /// values of the nodes selected by an XPath expression.
        /// </summary>
        /// <param name="html">HTML markup to convert.</param>
        /// <param name="xpathNavPath">
        /// XPath expression evaluated against the converted XML, or null to return the
        /// whole converted document.
        /// </param>
        /// <returns>
        /// The converted XML when <paramref name="xpathNavPath"/> is null, otherwise the
        /// value of each selected node followed by a line feed. In both cases every "\r"
        /// is replaced by "\n" and "\n\n" pairs are then collapsed in a single pass.
        /// If anything throws, the exception message is returned instead.
        /// </returns>
        public static string GetWellFormedHTML(string html, string xpathNavPath)
        {
            try
            {
                string wellFormed;

                // 'using' releases everything on both the success and the failure path
                using (SgmlReader reader = new SgmlReader())
                using (StringWriter sw = new StringWriter())
                using (XmlTextWriter writer = new XmlTextWriter(sw))
                {
                    reader.DocType     = "HTML";
                    reader.InputStream = new StringReader(html);
                    writer.Formatting  = Formatting.Indented;

                    // WriteNode leaves the reader on the node that follows the one it
                    // copied, so Read() is only called to step over skipped whitespace;
                    // calling it after every WriteNode would drop that following node.
                    reader.Read();
                    while (!reader.EOF)
                    {
                        if (reader.NodeType == XmlNodeType.Whitespace)
                        {
                            reader.Read();
                        }
                        else
                        {
                            writer.WriteNode(reader, true);
                        }
                    }
                    writer.Flush();
                    wellFormed = sw.ToString();
                }

                string result;
                if (xpathNavPath == null)
                {
                    result = wellFormed;
                }
                else
                {
                    // filter out nodes from the converted document
                    StringBuilder     sb    = new StringBuilder();
                    XPathDocument     doc   = new XPathDocument(new StringReader(wellFormed));
                    XPathNavigator    nav   = doc.CreateNavigator();
                    XPathNodeIterator nodes = nav.Select(xpathNavPath);
                    while (nodes.MoveNext())
                    {
                        sb.Append(nodes.Current.Value).Append("\n");
                    }
                    result = sb.ToString();
                }

                // same newline normalisation for both kinds of result
                return(result.Replace("\r", "\n").Replace("\n\n", "\n"));
            }
            catch (Exception exp)
            {
                // callers receive the failure reason as the return value
                return(exp.Message);
            }
        }
Beispiel #5
0
        /// <summary>
        /// Converts HTML markup to XML markup. Exceptions raised while parsing or writing
        /// are not handled here, so this must be called from within a try-catch block.
        /// </summary>
        /// <param name="html">HTML markup to convert.</param>
        /// <returns>
        /// The XML text, or an empty string when <paramref name="html"/> is null, empty
        /// or whitespace only.
        /// </returns>
        public static string HtmlToXml(string html)
        {
            if (string.IsNullOrWhiteSpace(html))
            {
                return(string.Empty);
            }

            // pre-process constructs that should not reach the parser unchanged
            html = StringUtils.ReplaceIgnoreCase(html, "<br>", "<br />");
            html = StringUtils.ReplaceIgnoreCase(html, "&#", "&amp;#");
            html = html.Replace(" @", " hexadecimal-value-0x40"); //vuejs shorthand @click
            html = html.Replace(" :", " hexadecimal-value-0x3a"); //vuejs shorthand :href

            // 'using' releases the readers and writers even when conversion throws
            using (var reader = new SgmlReader { DocType = "HTML" })
            using (var sr = new System.IO.StringReader(html))
            using (var sw = new System.IO.StringWriter())
            using (var w = new XmlTextWriter(sw))
            {
                reader.InputStream = sr;

                // WriteNode advances the reader, so loop on EOF instead of Read()
                reader.Read();
                while (!reader.EOF)
                {
                    w.WriteNode(reader, true);
                }

                w.Flush();
                w.Close(); // closed before the text is taken, as before
                return(sw.ToString());
            }
        }
Beispiel #6
0
        /// <summary>
        /// Converts HTML markup to XML markup. Must be called from within a try-catch block.
        /// </summary>
        public static string HtmlToXml(string strInputHtml)
        {
            // rewrite bare <br> tags and escape the "&#" prefix before parsing
            string prepared = StringUtils.ReplaceIgnoreCase(strInputHtml, "<br>", "<br />");
            prepared = StringUtils.ReplaceIgnoreCase(prepared, "&#", "&amp;#");

            var sgml = new SgmlReader();
            sgml.DocType     = "HTML";
            sgml.InputStream = new System.IO.StringReader(prepared);

            var buffer    = new System.IO.StringWriter();
            var xmlWriter = new XmlTextWriter(buffer);

            // WriteNode moves the reader forward itself; stop once it reports EOF
            for (sgml.Read(); !sgml.EOF;)
            {
                xmlWriter.WriteNode(sgml, true);
            }
            xmlWriter.Flush();
            xmlWriter.Close();

            return(buffer.ToString());
        }
Beispiel #7
0
        // Converts an HTML string to XML text.
        // Returns "<html></html>" for null or empty input. If copying a node fails,
        // conversion stops and whatever was written up to that point is returned.
        private string ProcessString(string strInputHtml)
        {
            if (String.IsNullOrEmpty(strInputHtml))
            {
                return("<html></html>");
            }

            // 'using' releases the readers and writers on every path
            using (SgmlReader rd = new SgmlReader())
            using (StringReader sr = new System.IO.StringReader(strInputHtml))
            using (StringWriter sw = new StringWriter())
            using (XmlTextWriter xw = new XmlTextWriter(sw))
            {
                rd.DocType     = "HTML";
                rd.InputStream = sr;

                rd.Read();
                while (!rd.EOF)
                {
                    try
                    {
                        xw.WriteNode(rd, true);
                    }
                    catch
                    {
                        // deliberate best effort: keep the partial output instead of failing
                        break;
                    }
                }
                xw.Flush();
                xw.Close(); // closed before the text is taken, as before

                return(sw.ToString());
            }
        }
Beispiel #8
0
        // Connects the reader to its input and then runs one of three modes:
        //   debug - dump the node stream via Debug() and close the reader
        //   crawl - hand over to StartCrawl()
        //   copy  - write the document to 'output' (or standard output), either through
        //           an XmlDocument round trip (testdoc) or node by node.
        // Input: standard input when uri is null; a stream opened here when loadAsStream
        // is set; otherwise the reader is pointed at uri.
        void Process(SgmlReader reader, string uri, bool loadAsStream) {
            if (uri == null) {
                reader.InputStream = Console.In;
            }
            else if (loadAsStream) {
                Uri location = new Uri(uri);
                if (location.IsFile) {
                    // StreamReader expects a file-system path, so a "file:" URI is
                    // converted first; plain paths are passed through untouched.
                    string path = uri.StartsWith("file:", StringComparison.OrdinalIgnoreCase)
                        ? location.LocalPath
                        : uri;
                    reader.InputStream = new StreamReader(path);
                } else {
                    WebRequest wr = WebRequest.Create(location);
                    reader.InputStream = new StreamReader(wr.GetResponse().GetResponseStream());
                }
            } else {
                reader.Href = uri;
            }

            if (debug) {
                Debug(reader);
                reader.Close();
                return;
            }
            if (crawl) {
                StartCrawl(reader, uri, basify);
                return;
            }

            // fall back to the reader's encoding only when none was chosen beforehand
            if (this.encoding == null) {
                this.encoding = reader.GetEncoding();
            }

            // 'using' guarantees the writer is closed even if copying throws part-way
            using (XmlTextWriter w = output != null
                ? new XmlTextWriter(output, this.encoding)
                : new XmlTextWriter(Console.Out)) {
                if (formatted) {
                    w.Formatting = Formatting.Indented;
                }
                if (!noxmldecl) {
                    w.WriteStartDocument();
                }
                if (testdoc) {
                    XmlDocument doc = new XmlDocument();
                    try {
                        doc.Load(reader);
                        doc.WriteTo(w);
                    } catch (XmlException e) {
                        Console.WriteLine("Error:" + e.Message);
                        Console.WriteLine("at line " + e.LineNumber + " column " + e.LinePosition);
                    }
                } else {
                    // WriteNode advances the reader, so loop on EOF instead of Read()
                    reader.Read();
                    while (!reader.EOF) {
                        w.WriteNode(reader, true);
                    }
                }
                w.Flush();
            }
        }
        /// <summary>
        /// Converts HTML to XML and optionally extracts the nodes selected by an XPath expression.
        /// </summary>
        /// <param name="htmlString">
        /// HTML markup to convert. Null input, or input whose trimmed length is below 10
        /// characters, yields an empty string.
        /// </param>
        /// <param name="xpath">
        /// XPath expression evaluated against the converted XML, or null to return the
        /// whole converted document.
        /// </param>
        /// <returns>
        /// The converted XML when <paramref name="xpath"/> is null, otherwise the outer XML
        /// of each selected node followed by a space. An empty string is returned when
        /// the conversion or the XPath query fails.
        /// </returns>
        public static string GetWellFormedHTML(string htmlString, string xpath)
        {
            if (htmlString == null || htmlString.Trim().Length < 10)
            {
                return("");
            }

            // Every occurrence of "xmlns" is renamed before parsing.
            // NOTE(review): presumably so that no namespace declarations survive and
            // callers can use unprefixed XPath expressions - confirm.
            htmlString = htmlString.Replace("xmlns", "buyao");

            try
            {
                string wellFormed;

                // 'using' releases everything on both the success and the failure path
                using (SgmlReader reader = new SgmlReader())
                using (StringWriter sw = new StringWriter())
                using (XmlTextWriter writer = new XmlTextWriter(sw))
                {
                    reader.DocType     = "HTML";
                    reader.InputStream = new StringReader(htmlString);
                    writer.Formatting  = Formatting.Indented;
                    writer.WriteStartDocument();

                    // NOTE(review): WriteNode already advances the reader, so the extra
                    // Read() per iteration may step over a node - confirm before changing,
                    // since the error-swallowing below relies on Read() to make progress.
                    while (reader.Read())
                    {
                        if (reader.NodeType != XmlNodeType.Whitespace)
                        {
                            try
                            {
                                writer.WriteNode(reader, true);
                            }
                            catch (Exception)
                            {
                                // deliberate best effort: a node that cannot be written is discarded
                            }
                        }
                    }
                    wellFormed = sw.ToString();
                }

                if (xpath == null)
                {
                    return(wellFormed);
                }

                StringBuilder     sb    = new StringBuilder();
                XPathDocument     doc   = new XPathDocument(new StringReader(wellFormed));
                XPathNavigator    nav   = doc.CreateNavigator();
                XPathNodeIterator nodes = nav.Select(xpath);
                while (nodes.MoveNext())
                {
                    sb.Append(nodes.Current.OuterXml).Append(" ");
                }
                return(sb.ToString());
            }
            catch (Exception)
            {
                // any failure is reported to the caller as an empty result
                return("");
            }
        }
Beispiel #10
0
        // Walks the whole reader and prints one indented line per node (node type,
        // name, first attribute, value). While walking it keeps its own stack of open
        // container nodes and reports two kinds of inconsistency on the console:
        //   "Invalid content!!" - the node's type is not allowed under its parent's type
        //   "Depth is wrong!"   - the reader's Depth disagrees with the stack size
        void Debug(SgmlReader sr) {
            // indexed by (int)XmlNodeType of the parent: which child node types it may contain
            NodeTypeFlags[] AllowedContentMap = new NodeTypeFlags[19] {
                NodeTypeFlags.None, // none
                NodeTypeFlags.Element | NodeTypeFlags.Attribute | NodeTypeFlags.Text | NodeTypeFlags.CDATA | NodeTypeFlags.EntityReference | NodeTypeFlags.ProcessingInstruction | NodeTypeFlags.Comment | NodeTypeFlags.Whitespace | NodeTypeFlags.SignificantWhitespace | NodeTypeFlags.EndElement, // element
                NodeTypeFlags.Text | NodeTypeFlags.EntityReference, // attribute
                NodeTypeFlags.None, // text
                NodeTypeFlags.None, // cdata
                NodeTypeFlags.None, // entity reference
                NodeTypeFlags.None, // entity
                NodeTypeFlags.None, // processing instruction
                NodeTypeFlags.None, // comment
                NodeTypeFlags.Comment | NodeTypeFlags.DocumentType | NodeTypeFlags.Element | NodeTypeFlags.EndElement | NodeTypeFlags.ProcessingInstruction | NodeTypeFlags.Whitespace | NodeTypeFlags.SignificantWhitespace | NodeTypeFlags.XmlDeclaration, // document
                NodeTypeFlags.None, // document type
                NodeTypeFlags.None, // document fragment (not expecting these)
                NodeTypeFlags.None, // notation
                NodeTypeFlags.None, // whitespace
                NodeTypeFlags.None, // signification whitespace
                NodeTypeFlags.None, // end element
                NodeTypeFlags.None, // end entity
                NodeTypeFlags.None, // filler
                NodeTypeFlags.None, // xml declaration.
            };

            // node types of the currently open containers, innermost on top
            Stack s = new Stack();

            while (sr.Read()) {
                // an end element closes the innermost open container
                if (sr.NodeType == XmlNodeType.EndElement) {
                    s.Pop();
                }
                if (s.Count > 0) {
                    XmlNodeType pt = (XmlNodeType)s.Peek();
                    NodeTypeFlags f = NodeTypeMap[(int)sr.NodeType];
                    if ((AllowedContentMap[(int)pt] & f) != f) {
                        Console.WriteLine("Invalid content!!");
                    }
                }
                if (s.Count != sr.Depth-1) {
                    Console.WriteLine("Depth is wrong!");
                }
                // only nodes that can have children are tracked
                if ( (sr.NodeType == XmlNodeType.Element && !sr.IsEmptyElement) ||
                    sr.NodeType == XmlNodeType.Document) {
                    s.Push(sr.NodeType);
                }

                // two spaces of indentation per level below the first
                for (int i = 1; i < sr.Depth; i++) {
                    Console.Write("  ");
                }
                Console.Write(sr.NodeType.ToString() + " " + sr.Name);
                if (sr.NodeType == XmlNodeType.Element && sr.AttributeCount > 0) {
                    // show the first attribute only, then return to the element
                    sr.MoveToAttribute(0);
                    Console.Write(" (" + sr.Name+"="+sr.Value + ")");
                    sr.MoveToElement();
                }
                if (sr.Value != null) {
                    // keep each node on a single output line
                    Console.Write(" " + sr.Value.Replace("\n"," ").Replace("\r",""));
                }
                Console.WriteLine();
            }
        }
Beispiel #11
0
        // Converts the returned HTML page to XML, selects the table cells of the
        // tracking-detail grid and collects (time, state) pairs into a ResultInfo.
        private ResultInfo getDetail(string backstring)
        {
            SgmlReader sgml = new SgmlReader();
            sgml.DocType     = "HTML";
            sgml.InputStream = new StringReader(backstring);

            StringWriter  buffer    = new StringWriter();
            XmlTextWriter xmlWriter = new XmlTextWriter(buffer);

            sgml.WhitespaceHandling = WhitespaceHandling.None;
            xmlWriter.Formatting    = Formatting.Indented;
            while (sgml.Read())
            {
                if (sgml.NodeType == XmlNodeType.Whitespace)
                {
                    continue;
                }
                xmlWriter.WriteNode(sgml, true);
            }

            XmlDocument page = new XmlDocument();
            page.Load(new StringReader(buffer.ToString()));

            XmlNamespaceManager namespaces = new XmlNamespaceManager(page.NameTable);
            namespaces.AddNamespace("bottum", "http://www.w3.org/1999/xhtml");

            // cells of the nested table inside the tracking-detail grid
            XPathNodeIterator cells = page.CreateNavigator().Select(
                "//bottum:table[@id='ctl00_ContentPlaceHolder1_TrackDetail']/bottum:tr/bottum:td/bottum:div[8]/bottum:table/bottum:tr/bottum:td",
                namespaces);
            ResultInfo result = new ResultInfo(querynum);

            int cellCount = cells.Count;

            // step over the first three cells when there are more than three
            for (int skipped = 0; cellCount > 3 && skipped < 3; skipped++)
            {
                cells.MoveNext();
            }

            // each remaining group is three cells wide: the first holds the time, the third the state
            int row = 1;
            while (row < cellCount / 3)
            {
                cells.MoveNext();
                string time = cells.Current.Value;
                cells.MoveNext();
                cells.MoveNext();
                string state = cells.Current.Value;
                result.add(time, state);
                row++;
            }

            sgml.Close();
            xmlWriter.Close();
            buffer.Close();
            return(result);
        }
Beispiel #12
0
        // Converts the returned HTML page to XML, selects the cells of the table with
        // id 'GridView1' and collects (time, state) pairs into a ResultInfo.
        public ResultInfo getDetail(string backstring)
        {
            SgmlReader sgml = new SgmlReader();
            sgml.DocType     = "HTML";
            sgml.InputStream = new StringReader(backstring);

            StringWriter  buffer    = new StringWriter();
            XmlTextWriter xmlWriter = new XmlTextWriter(buffer);

            sgml.WhitespaceHandling = WhitespaceHandling.None;
            xmlWriter.Formatting    = Formatting.Indented;
            while (sgml.Read())
            {
                if (sgml.NodeType == XmlNodeType.Whitespace)
                {
                    continue;
                }
                xmlWriter.WriteNode(sgml, true);
            }

            XmlDocument page = new XmlDocument();
            page.Load(new StringReader(buffer.ToString()));

            // the conversion objects are no longer needed once the document is loaded
            sgml.Close();
            xmlWriter.Close();
            buffer.Close();

            XmlNamespaceManager namespaces = new XmlNamespaceManager(page.NameTable);
            namespaces.AddNamespace("bottum", "http://www.w3.org/1999/xhtml");

            XPathNodeIterator cells = page.CreateNavigator().Select(
                "//bottum:table[@id='GridView1']/bottum:tr/bottum:td",
                namespaces);

            if (cells == null)
            {
                return(new ResultInfo(queryNumber));
            }

            // rows are three cells wide: the second cell holds the time, the third the state
            int        rows   = cells.Count / 3;
            ResultInfo result = new ResultInfo(queryNumber);
            for (int row = 0; row < rows; row++)
            {
                cells.MoveNext();
                cells.MoveNext();
                string time = cells.Current.Value;
                cells.MoveNext();
                string state = cells.Current.Value;
                result.add(time, state);
            }

            return(result);
        }
        // Parses the snapshot from its start up to searchEnd and tries to figure out
        // where the opening tag that matches our closing tag starts.
        //
        // Every non-empty element named searchFor pushes its (approximate, then
        // backtracked to '<') start position; every end element with that name pops
        // one, unless it lies at or beyond searchEnd, which ends the scan. Whatever is
        // left on top of the stack is the still-open tag we are looking for.
        //
        // Returns a span of searchFor.Length + 2 characters starting at that position,
        // or null when no open tag is left, the nesting is malformed, or parsing throws.
        private SnapshotSpan?FindOpeningTag(ITextSnapshot snapshot, int searchEnd, string searchFor)
        {
            String textToSearch = snapshot.GetText(0, searchEnd);

            using (SgmlReader reader = new SgmlReader()) {
                reader.InputStream        = new StringReader(textToSearch);
                reader.WhitespaceHandling = WhitespaceHandling.All;
                try {
                    Stack <int> openingPositions = new Stack <int>();
                    while (reader.Read())
                    {
                        if (reader.LocalName != searchFor)
                        {
                            continue;
                        }
                        if (reader.NodeType == XmlNodeType.Element && !reader.IsEmptyElement)
                        {
                            // find close to where the tag starts
                            // (the reader's line numbers are 1-based, the snapshot's 0-based)
                            int lineNum  = reader.LineNumber - 1;
                            var line     = snapshot.GetLineFromLineNumber(lineNum);
                            int position = line.Start.Position + reader.LinePosition - searchFor.Length;
                            position = BacktrackToLessThan(snapshot, position);
                            // No look-ahead read of the snapshot here: reading a fixed
                            // 10 characters threw for tags near the end of the text and
                            // made the whole search return null.
                            openingPositions.Push(position);
                        }
                        else if (reader.NodeType == XmlNodeType.EndElement)
                        {
                            if (openingPositions.Count <= 0)
                            {
                                // document is malformed, so just get the heck out
                                return(null);
                            }
                            var line     = snapshot.GetLineFromLineNumber(reader.LineNumber - 1);
                            int position = line.Start.Position + reader.LinePosition;
                            if (position >= searchEnd)
                            {
                                // reached the closing tag we started from; its opener stays on the stack
                                break;
                            }
                            openingPositions.Pop();
                        }
                    }
                    // done, the innermost tag still open is the match
                    if (openingPositions.Count > 0)
                    {
                        int position = openingPositions.Pop();
                        return(new SnapshotSpan(snapshot, position, searchFor.Length + 2));
                    }
                } catch (Exception ex) {
                    Trace.WriteLine(String.Format("Exception while parsing document: {0}.", ex.ToString()));
                }
            }
            return(null);
        }
Beispiel #14
0
        // Converts the returned HTML page to XML, selects the cells matched by
        // "//table[1]/tr/td" and collects (time, state) pairs into a ResultInfo.
        private ResultInfo getDetail(string backstring)
        {
            XmlDocument doc = new XmlDocument();

            // 'using' releases the reader and writers even when conversion or loading throws
            using (SgmlReader reader = new SgmlReader())
            using (StringWriter sw = new StringWriter())
            using (XmlTextWriter writer = new XmlTextWriter(sw))
            {
                reader.DocType            = "HTML";
                reader.InputStream        = new StringReader(backstring);
                reader.WhitespaceHandling = WhitespaceHandling.None;
                writer.Formatting         = Formatting.Indented;
                while (reader.Read())
                {
                    if (reader.NodeType != XmlNodeType.Whitespace)
                    {
                        writer.WriteNode(reader, true);
                    }
                }

                doc.Load(new StringReader(sw.ToString()));
            }

            XmlNamespaceManager xnm = new XmlNamespaceManager(doc.NameTable);
            XPathNavigator      nav = doc.CreateNavigator();

            // analyse the page according to the layout of the returned result
            string            xpath = "//table[1]/tr/td";
            XPathNodeIterator nodes = nav.Select(xpath, xnm);

            ResultInfo backinfo = new ResultInfo(querynum);

            // step over the first four cells when at least four were found
            if (nodes.Count >= 4)
            {
                nodes.MoveNext();
                nodes.MoveNext();
                nodes.MoveNext();
                nodes.MoveNext();
            }
            // each iteration consumes three cells: time, state, and one that is ignored
            for (int i = 4; i < nodes.Count / 2; i++)
            {
                nodes.MoveNext();
                string time = nodes.Current.Value;
                nodes.MoveNext();
                string state = nodes.Current.Value;
                backinfo.add(time, state);
                nodes.MoveNext();
            }
            return(backinfo);
        }
        // Scans HTML for <img> elements and returns one ImageInfo per element that
        // carries a non-empty src attribute (width and height are captured when present).
        internal static ImageInfo[] FindImgs(
            string htmlCode)
        {
            var sgml = new SgmlReader();
            sgml.DocType     = @"HTML";
            sgml.InputStream = new StringReader(htmlCode);

            var images = new List <ImageInfo>();

            while (sgml.Read())
            {
                // only <img ...> elements that actually have attributes are of interest
                bool isImgWithAttributes =
                    sgml.NodeType == XmlNodeType.Element &&
                    String.Equals(sgml.Name, @"img", StringComparison.OrdinalIgnoreCase) &&
                    sgml.HasAttributes;
                if (!isImgWithAttributes)
                {
                    continue;
                }

                var info = new ImageInfo();

                while (sgml.MoveToNextAttribute())
                {
                    string attribute = sgml.Name.ToLowerInvariant();
                    if (attribute == @"src")
                    {
                        info.Source = sgml.Value;
                    }
                    else if (attribute == @"width")
                    {
                        info.Width = ConvertHelper.ToInt32(sgml.Value);
                    }
                    else if (attribute == @"height")
                    {
                        info.Height = ConvertHelper.ToInt32(sgml.Value);
                    }
                }

                // an image without a source is not reported
                if (!String.IsNullOrEmpty(info.Source))
                {
                    images.Add(info);
                }
            }

            return(images.ToArray());
        }
Beispiel #16
0
        // Converts the returned HTML page to XML, selects the cells matched by
        // "/html/body/table[8]/tr/td" and collects (time, state) pairs into a ResultInfo.
        public ResultInfo getDetail(string backstring)
        {
            // remove the XHTML namespace declaration from the markup before parsing
            string markup = backstring.Replace("xmlns=\"http://www.w3.org/1999/xhtml\"", "");

            SgmlReader sgml = new SgmlReader();
            sgml.DocType     = "HTML";
            sgml.InputStream = new StringReader(markup);

            StringWriter  buffer    = new StringWriter();
            XmlTextWriter xmlWriter = new XmlTextWriter(buffer);

            sgml.WhitespaceHandling = WhitespaceHandling.None;
            xmlWriter.Formatting    = Formatting.Indented;
            while (sgml.Read())
            {
                if (sgml.NodeType == XmlNodeType.Whitespace)
                {
                    continue;
                }
                xmlWriter.WriteNode(sgml, true);
            }

            XmlDocument page = new XmlDocument();
            page.Load(new StringReader(buffer.ToString()));

            XmlNamespaceManager namespaces = new XmlNamespaceManager(page.NameTable);
            namespaces.AddNamespace("bottum", "http://www.w3.org/1999/xhtml");

            XPathNodeIterator cells = page.CreateNavigator().Select("/html/body/table[8]/tr/td", namespaces);

            int cellCount = cells.Count;

            // step over the first two cells when at least two were found
            for (int skipped = 0; cellCount >= 2 && skipped < 2; skipped++)
            {
                cells.MoveNext();
            }
            ResultInfo result = new ResultInfo(querynum);

            // remaining cells come in (time, state) pairs
            int row = 1;
            while (row < cellCount / 2)
            {
                cells.MoveNext();
                string time = cells.Current.Value;
                cells.MoveNext();
                string state = cells.Current.Value;
                result.add(time, state);
                row++;
            }

            sgml.Close();
            xmlWriter.Close();
            buffer.Close();
            return(result);
        }
Beispiel #17
0
        // Copies the reader's nodes to the writer, closing an element as soon as its
        // text has been written (except for MSGBODY). Elements closed that way are
        // remembered so that a later matching end element is not written a second time.
        protected internal static void AutoCloseTags(SgmlReader reader, XmlWriter writer)
        {
            // added to the reader's name table so it can be compared with the reader's names
            object msgBody = reader.NameTable.Add("MSGBODY");

            object lastOpened    = null;
            Stack  closedEarlier = new Stack();

            while (reader.Read())
            {
                XmlNodeType kind = reader.NodeType;

                if (kind == XmlNodeType.Element)
                {
                    lastOpened = reader.LocalName;
                    writer.WriteStartElement(reader.LocalName);
                }
                else if (kind == XmlNodeType.Text)
                {
                    if (!String.IsNullOrEmpty(reader.Value))
                    {
                        writer.WriteString(reader.Value.Trim());
                        if (lastOpened != null && !lastOpened.Equals(msgBody))
                        {
                            // close the element right after its text and remember that we did
                            writer.WriteEndElement();
                            closedEarlier.Push(lastOpened);
                        }
                    }
                    else
                    {
                        // NOTE(review): asserting 'true' can never fire - confirm whether 'false' was intended
                        Debug.Assert(true, "big problems?");
                    }
                }
                else if (kind == XmlNodeType.EndElement)
                {
                    // reference comparison: the remembered name must be the very same string instance
                    bool alreadyClosed = closedEarlier.Count > 0 &&
                                         Object.ReferenceEquals(closedEarlier.Peek(), reader.LocalName);
                    if (alreadyClosed)
                    {
                        closedEarlier.Pop();
                    }
                    else
                    {
                        writer.WriteEndElement();
                    }
                }
                else
                {
                    // every other node type is copied through unchanged
                    writer.WriteNode(reader, false);
                }
            }
        }
Beispiel #18
0
        /// <summary>
        /// Gets the title out of the HTML head section.
        /// </summary>
        /// <param name="url">The URL of the page</param>
        /// <param name="defaultIfNoMatch">string to return, if no match was found</param>
        /// <param name="proxy">Proxy server to direct the request through</param>
        /// <param name="credentials">Credentials for authenticating the request</param>
        /// <returns>
        /// The content of the first <c>title</c> element found in the response, or
        /// <paramref name="defaultIfNoMatch"/> when no such element is found or an error occurs
        /// while downloading or parsing the page (the error is logged at debug level).
        /// </returns>
        // TODO: duplicates FindTitle2() - decide which one should be kept.
        public static string FindTitle(string url, string defaultIfNoMatch, IWebProxy proxy, ICredentials credentials)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

            request.AllowAutoRedirect = true;
            request.Proxy             = proxy;
            request.Credentials       = credentials;
            request.Timeout           = 5 * 1000 /* 5 second timeout */;

            if (FeedSource.SetCookies)
            {
                HttpCookieManager.SetCookies(request);
            }

            /* use bogus user agent since some sites will bounce you to unsupported browser page otherwise */
            request.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1;)";

            string title = defaultIfNoMatch;

            try
            {
                // Dispose the response itself (not just its stream) so the underlying
                // connection is released even when we stop reading early.
                using (WebResponse response = request.GetResponse())
                using (Stream stream = response.GetResponseStream())
                using (StreamReader text = new StreamReader(stream))
                {
                    SgmlReader reader = new SgmlReader();
                    reader.InputStream = text;

                    while (reader.Read())
                    {
                        // Ordinal, case-insensitive match: ToLower() is culture-sensitive and
                        // fails for "TITLE" under e.g. the Turkish culture.
                        if (reader.NodeType == XmlNodeType.Element &&
                            string.Equals(reader.Name, "title", StringComparison.OrdinalIgnoreCase))
                        {
                            title = reader.ReadElementContentAsString();
                            break;
                        }
                    }
                }
            }
            catch (Exception e)
            {
                // Best effort: fall back to the default title (or whatever was read so far).
                _log.Debug("Error retrieving title from HTML page at " + url, e);
            }

            return(title);
        }
Beispiel #19
0
 // Runs a single conversion test: configures the reader from the space-separated
 // options in args ("-html", "-lower", "-upper", "-testdoc"), converts input to
 // indented XML and compares the trimmed result with the trimmed expectedOutput.
 // Updates the tests/passed counters and prints a report to the console on mismatch.
 void RunTest(SgmlReader reader, int line, string args, string input, string expectedOutput){
     bool loadAsDocument = false;
     foreach (string rawOption in args.Split(' ')){
         string option = rawOption.Trim();
         // Only non-empty tokens starting with '-' are options; everything else is ignored.
         if (option.Length == 0 || option[0] != '-') {
             continue;
         }
         string optionName = option.Substring(1);
         if (optionName == "html") {
             reader.DocType = "html";
         } else if (optionName == "lower") {
             reader.CaseFolding = CaseFolding.ToLower;
         } else if (optionName == "upper") {
             reader.CaseFolding = CaseFolding.ToUpper;
         } else if (optionName == "testdoc") {
             loadAsDocument = true;
         }
     }

     this.tests++;
     reader.InputStream = new StringReader(input);
     reader.WhitespaceHandling = WhitespaceHandling.None;

     StringWriter buffer = new StringWriter();
     XmlTextWriter xml = new XmlTextWriter(buffer) { Formatting = Formatting.Indented };
     if (!loadAsDocument) {
         // Stream the nodes straight from the reader to the writer.
         for (reader.Read(); !reader.EOF; ) {
             xml.WriteNode(reader, true);
         }
     } else {
         // Round-trip through an XmlDocument instead.
         XmlDocument doc = new XmlDocument();
         doc.Load(reader);
         doc.WriteTo(xml);
     }
     xml.Close();

     string actualOutput = buffer.ToString();
     if (actualOutput.Trim() == expectedOutput.Trim()) {
         this.passed++;
         return;
     }

     Console.WriteLine("ERROR: Test failed on line {0}", line);
     Console.WriteLine("---- Expected output");
     Console.WriteLine(expectedOutput);
     Console.WriteLine("---- Actual output");
     Console.WriteLine(actualOutput);
 }
        /// <summary>
        /// Lazily enumerates the value of <paramref name="attributeName"/> for every element
        /// named <paramref name="tagName"/> found in the given HTML fragment.
        /// </summary>
        /// <param name="html">HTML fragment; it is wrapped in an <c>html</c> element before parsing.</param>
        /// <param name="tagName">Local name of the elements to look for.</param>
        /// <param name="attributeName">Name of the attribute whose value is returned.</param>
        /// <returns>One entry per matching element: the result of <c>GetAttribute</c> for that element.</returns>
        public static IEnumerable <string> GetAttributeValues(this string html, string tagName, string attributeName)
        {
            var sgml = new SgmlReader();
            sgml.DocType            = "html";
            sgml.WhitespaceHandling = WhitespaceHandling.All;
            sgml.InputStream        = new StringReader("<html>" + html + "</html>");

            while (sgml.Read() && !sgml.EOF)
            {
                bool isRequestedElement = sgml.NodeType == XmlNodeType.Element &&
                                          sgml.LocalName == tagName;
                if (!isRequestedElement)
                {
                    continue;
                }

                yield return sgml.GetAttribute(attributeName);
            }
        }
Beispiel #21
0
        /// <summary>
        /// Exercises MoveToElement after all attributes of the first node have been visited,
        /// logging each attribute name and then the element's inner XML.
        /// </summary>
        public void Test_MoveToNextAttribute()
        {
            var r = new SgmlReader();
            r.InputStream = new StringReader("<test id='10' x='20'><a/><!--comment-->test</test>");

            // The first Read() must succeed and position the reader on <test>.
            Assert.IsTrue(r.Read());

            for (bool hasAttribute = r.MoveToNextAttribute(); hasAttribute; hasAttribute = r.MoveToNextAttribute())
            {
                _log.Debug(r.Name);
            }

            // Going back from the last attribute to the owning element is the point of the test.
            if (!r.MoveToElement())
            {
                return;
            }

            _log.Debug(r.ReadInnerXml());
        }
Beispiel #22
0
        /// <summary>
        /// Regression check: MoveToElement must work after several attributes have been read.
        /// Attribute names and the element's inner XML are written to the trace output.
        /// </summary>
        void RegressionTest1()
        {
            var r = new SgmlReader
            {
                InputStream = new StringReader("<test id='10' x='20'><a/><!--comment-->test</test>")
            };

            if (!r.Read())
            {
                return;
            }

            // Walk over every attribute of the first node.
            while (r.MoveToNextAttribute())
            {
                Trace.WriteLine(r.Name);
            }

            // Return to the owning element; only then is its content read.
            if (!r.MoveToElement())
            {
                return;
            }

            Trace.WriteLine(r.ReadInnerXml());
        }
Beispiel #23
0
        /// <summary>
        /// Converts the given HTML page to XML with SgmlReader and collects (time, state)
        /// pairs from the table cells selected by the XPath expression below.
        /// </summary>
        /// <param name="backstring">HTML text of the page to analyse.</param>
        /// <returns>
        /// A <c>ResultInfo</c> created for <c>queryNumber</c>, with one entry added per pair
        /// of consecutive matching cells (first cell = time, second cell = state).
        /// </returns>
        private ResultInfo getDetail(string backstring)
        {
            // Parse the web page with the SGML library and convert it to XML text.
            string xml;

            using (SgmlReader readern = new SgmlReader())
            using (StringWriter sw = new StringWriter())
            using (XmlTextWriter writer = new XmlTextWriter(sw))
            {
                readern.DocType            = "HTML";
                readern.InputStream        = new StringReader(backstring);
                readern.WhitespaceHandling = WhitespaceHandling.None;
                writer.Formatting          = Formatting.Indented;

                // WriteNode already leaves the reader on the node following the one it copied,
                // so Read() is called only once up front. Calling Read() before every
                // WriteNode (as before) skipped any node that followed the first top-level node.
                readern.Read();
                while (!readern.EOF)
                {
                    if (readern.NodeType == XmlNodeType.Whitespace)
                    {
                        // Whitespace is not copied; just step over it.
                        readern.Read();
                    }
                    else
                    {
                        writer.WriteNode(readern, true);
                    }
                }

                writer.Flush();
                xml = sw.ToString();
            }

            XmlDocument doc = new XmlDocument();

            doc.Load(new StringReader(xml));
            XmlNamespaceManager xnm = new XmlNamespaceManager(doc.NameTable);
            XPathNavigator      nav = doc.CreateNavigator();

            // XPath expression selecting the result cells.
            string            xpath    = "//div[@id='ess_ctr1579_TrackResult_DivBill']/table[2]/tr[@class='font_c']/td";
            XPathNodeIterator nodes    = nav.Select(xpath, xnm);
            ResultInfo        backinfo = new ResultInfo(queryNumber);

            // Cells come in pairs: time followed by state. A trailing unpaired cell is ignored.
            int pairCount = nodes.Count / 2;
            for (int i = 0; i < pairCount; i++)
            {
                nodes.MoveNext();
                string time = nodes.Current.Value;
                nodes.MoveNext();
                string state = nodes.Current.Value;
                backinfo.add(time, state);
            }

            return(backinfo);
        }
Beispiel #24
0
        /// <summary>
        /// Reads the HTML file <paramref name="htmlFile"/> through an SgmlReader and writes the
        /// resulting nodes as indented, UTF-8 encoded XML to <paramref name="xhtmlFile"/>.
        /// </summary>
        /// <param name="htmlFile">Path of the HTML file to read.</param>
        /// <param name="xhtmlFile">Path of the file to write.</param>
        internal static void Convert(String htmlFile, String xhtmlFile)
        {
            using (SgmlReader sgml = new SgmlReader())
            {
                sgml.DocType            = "HTML";
                sgml.WhitespaceHandling = WhitespaceHandling.None;

                using (StreamReader source = new StreamReader(htmlFile))
                {
                    sgml.InputStream = source;

                    // The output file is only created once the input could be opened.
                    using (XmlTextWriter target = new XmlTextWriter(xhtmlFile, Encoding.UTF8) { Formatting = Formatting.Indented })
                    {
                        // Position on the first node, then copy node by node until the end;
                        // WriteNode advances the reader itself.
                        for (sgml.Read(); !sgml.EOF;)
                        {
                            target.WriteNode(sgml, true);
                        }
                    }
                }
            }
        }
Beispiel #25
0
        /// <summary>
        /// Converts HTML text to XML by streaming it through an SgmlReader into an
        /// XmlTextWriter, then passes the result through <c>RemoveCopyOfImage</c>.
        /// </summary>
        /// <param name="inputHtml">The HTML text to convert.</param>
        /// <returns>The value returned by <c>RemoveCopyOfImage</c> for the converted markup.</returns>
        public static string TransformHtmlToXHTML(string inputHtml)
        {
            var sgmlReader = new SgmlReader();
            sgmlReader.DocType     = "HTML";
            sgmlReader.InputStream = new StringReader(inputHtml);

            var xhtmlBuffer = new StringWriter();

            using (var xmlWriter = new XmlTextWriter(xhtmlBuffer))
            {
                // Position on the first node, then copy until the reader is exhausted;
                // WriteNode moves the reader forward on its own.
                for (sgmlReader.Read(); !sgmlReader.EOF;)
                {
                    xmlWriter.WriteNode(sgmlReader, true);
                }
            }

            string converted = xhtmlBuffer.ToString();
            return RemoveCopyOfImage(converted);
        }
Beispiel #26
0
    /// <summary>
    /// Downloads the document at <paramref name="url"/>, parses it with SgmlReader and
    /// writes it as XML to <paramref name="fileToSave"/>. Does nothing when the target
    /// file already exists.
    /// </summary>
    /// <param name="url">Address of the document to fetch.</param>
    /// <param name="fileToSave">Path of the XML file to create.</param>
    static void SaveAsXml(string url, string fileToSave)
    {
        if (File.Exists(fileToSave))
        {
            return;
        }

        var stream = new XmlUrlResolver().GetEntity(new Uri(url), null, typeof(Stream)) as Stream;

        // Every resource is disposed even when parsing or writing fails; previously the
        // network stream was never closed and the output file stayed open on errors.
        using (stream)
        using (var input = new StreamReader(stream))
        using (var xr = new SgmlReader())
        using (var xw = XmlWriter.Create(fileToSave))
        {
            xr.InputStream = input;
            xr.MoveToContent();

            // WriteNode leaves the reader on the node after the one it copied, so the loop
            // must not call Read() again (that would skip a node); just test for EOF.
            while (!xr.EOF)
            {
                xw.WriteNode(xr, false);
            }
        }
    }
Beispiel #27
0
        /// <summary>
        /// Feeds one input through <paramref name="reader"/> and writes the resulting XML to
        /// the configured output (or the console when no output is configured).
        /// </summary>
        /// <param name="reader">The reader to configure and drain.</param>
        /// <param name="uri">Location to load via <c>Href</c>; when null, console input is read instead.</param>
        private void Process(SgmlReader reader, string uri)
        {
            // Select the input source.
            if (uri != null)
            {
                reader.Href = uri;
            }
            else
            {
                reader.InputStream = Console.In;
            }

            // Fall back to the reader's encoding when none has been set yet.
            encoding ??= reader.GetEncoding();

            bool dropBom = noUtf8Bom && encoding.Equals(Encoding.UTF8);
            if (dropBom)
            {
                encoding = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true);
            }

            XmlTextWriter w;
            if (output == null)
            {
                w = new XmlTextWriter(Console.Out);
            }
            else
            {
                w = new XmlTextWriter(output, encoding);
            }

            using (w)
            {
                if (formatted)
                {
                    w.Formatting = Formatting.Indented;
                }

                if (!noxmldecl)
                {
                    w.WriteStartDocument();
                }

                // Position on the first node and copy everything; WriteNode advances the reader.
                for (reader.Read(); !reader.EOF;)
                {
                    w.WriteNode(reader, true);
                }

                w.Flush();
            }
        }
Beispiel #28
0
        /// <summary>
        /// Parses an input containing two top-level elements with fragment conformance and
        /// checks that each of them can be loaded as its own XDocument via ReadSubtree.
        /// </summary>
        public void Test_fragment_parsing()
        {
            var settings = new XmlReaderSettings
            {
                ConformanceLevel = ConformanceLevel.Fragment
            };
            var stream = new StringReader("<html><head></head><body></body></html> <script></script>");

            var reader = new SgmlReader(settings);
            reader.DocType     = "html";
            reader.InputStream = stream;

            int count = 0;
            while (reader.Read())
            {
                // Only element nodes start a new sub-document.
                if (reader.NodeType != XmlNodeType.Element)
                {
                    continue;
                }

                XDocument doc = XDocument.Load(reader.ReadSubtree());
                Debug.WriteLine(doc.ToString());
                count++;
            }

            Assert.AreEqual(2, count, "Expecing 2 XmlDocuments in the input stream");
        }
Beispiel #29
0
        /// <summary>
        /// Collects the distinct <c>src</c> attribute values of all <c>img</c> elements in
        /// the given HTML, in order of first appearance.
        /// </summary>
        /// <param name="htmlCode">The HTML code.</param>
        /// <returns>
        /// The distinct <c>src</c> values; an empty array when no <c>img</c> element with a
        /// <c>src</c> attribute is found.
        /// </returns>
        internal static string[] FindImgs(
            string htmlCode)
        {
            var r =
                new SgmlReader
            {
                DocType     = @"HTML",
                InputStream = new StringReader(htmlCode)
            };
            var al   = new List <string>();
            var seen = new HashSet <string>();   // O(1) duplicate check; 'al' keeps the order

            //find <img src=""
            while (r.Read())
            {
                // Tag and attribute names are matched with ordinal, case-insensitive
                // comparison; culture-sensitive comparison / ToLower() breaks for "IMG" and
                // "SRC" under cultures with special casing rules for 'I' (e.g. Turkish).
                if (r.NodeType != XmlNodeType.Element ||
                    !string.Equals(r.Name, @"img", StringComparison.OrdinalIgnoreCase) ||
                    !r.HasAttributes)
                {
                    continue;
                }

                while (r.MoveToNextAttribute())
                {
                    if (string.Equals(r.Name, @"src", StringComparison.OrdinalIgnoreCase) &&
                        seen.Add(r.Value))
                    {
                        al.Add(r.Value);
                    }
                }
            }

            return(al.ToArray());
        }
Beispiel #30
0
        /// <summary>
        /// Walks the HTML in <paramref name="body"/> with an SgmlReader (names folded to
        /// lower case) and emits a Markdown-like text for the nodes it encounters.
        /// </summary>
        /// <param name="body">HTML text of the comment.</param>
        /// <returns>The accumulated output text.</returns>
        /// <exception cref="ArgumentOutOfRangeException">
        /// The reader reports a node type other than element, end element, text, CDATA or
        /// (significant) whitespace.
        /// </exception>
        private static string ConvertCommentToMarkdown(string body)
        {
            var markdown = new StringBuilder();

            var html = new SgmlReader
            {
                InputStream        = new StringReader(body),
                DocType            = "HTML",
                WhitespaceHandling = WhitespaceHandling.Significant,
                CaseFolding        = CaseFolding.ToLower
            };

            bool closePending = false;   // an unknown element was opened as "<name" and still needs ">"
            bool indentText   = false;   // text is prefixed with a tab while inside pre/code/quote

            while (html.Read())
            {
                XmlNodeType nodeType = html.NodeType;

                if (nodeType == XmlNodeType.Element)
                {
                    // NOTE(review): several cases below format html.Value while the reader is
                    // positioned on the element itself, not on its text - confirm this is intended.
                    switch (html.LocalName)
                    {
                    // Elements that produce no output of their own.
                    case "html":
                    case "ul":
                    case "ol":
                    case "img":
                        break;

                    case "h1":
                        markdown.Append("## ");
                        break;

                    case "br":
                        markdown.AppendLine("  ");
                        break;

                    case "a":
                        if (html.MoveToAttribute("href"))
                        {
                            string target = html.Value;

                            // Advance to the next node; its value is used as the link text.
                            html.Read();
                            markdown.AppendFormat("[{0}]({1})", html.Value, target);
                        }
                        break;

                    case "b":
                    case "strong":
                        markdown.AppendFormat("**{0}**", html.Value);
                        break;

                    case "em":
                    case "i":
                        markdown.AppendFormat("_{0}_", html.Value);
                        break;

                    case "li":
                        markdown.AppendFormat("- {0}", html.Value);
                        break;

                    case "pre":
                    case "code":
                    case "quote":
                        indentText = true;
                        break;

                    default:
                        // Unknown element: start echoing it as "<name"; the closing ">" is
                        // appended when the next end element is seen.
                        closePending = true;
                        markdown.Append("<").Append(html.LocalName);
                        break;
                    }
                    continue;
                }

                if (nodeType == XmlNodeType.Text)
                {
                    if (indentText)
                    {
                        markdown.Append("\t");
                    }
                    markdown.AppendLine(html.Value);
                    continue;
                }

                if (nodeType == XmlNodeType.EndElement)
                {
                    // Any end element stops the indentation and completes a pending "<name".
                    indentText = false;
                    if (closePending)
                    {
                        markdown.Append(">");
                    }
                    closePending = false;
                    continue;
                }

                if (nodeType == XmlNodeType.SignificantWhitespace ||
                    nodeType == XmlNodeType.Whitespace ||
                    nodeType == XmlNodeType.CDATA)
                {
                    // Deliberately ignored.
                    continue;
                }

                // NOTE(review): every other node type ends up here and aborts the conversion.
                throw new ArgumentOutOfRangeException();
            }

            return markdown.ToString();
        }