HTMLparser.Init C# (CSharp) 코드 예제들

예제 #1

0

파일 보기

 /// <summary>
 /// Creates a lookup over the supplied HTML text.
 /// </summary>
 /// <param name="doc">
 /// HTML to parse. When null or empty, no parser is created and <c>_parser</c> is not assigned.
 /// </param>
 public HtmlLookup(string doc)
 {
     // Nothing to parse: leave the parser field untouched.
     if (string.IsNullOrEmpty(doc))
     {
         return;
     }

     _parser = new HTMLparser();
     _parser.Init(doc);
 }

예제 #2

0

파일 보기

파일: BaiduParser.cs 프로젝트: colt365/lunar-thu

        /// <summary>
        /// Parses a result page chunk by chunk and forwards every chunk to the handler
        /// that matches its type. The populated <c>searchResult</c> field is returned.
        /// </summary>
        /// <param name="html">Page markup; it is converted to bytes with <paramref name="encoding"/> before parsing.</param>
        /// <param name="encoding">Encoding used both to produce the byte buffer and to configure the parser.</param>
        /// <returns>The <c>searchResult</c> instance created at the start of this call.</returns>
        public SearchEngineResult Parse(string html, Encoding encoding)
        {
            HTMLparser parser = HtmlParserFactory.GetInstance();

            // Fresh result state for this page.
            searchResult = new SearchEngineResult();
            searchResult.SearchEngineType = SearchEngineType.Baidu;
            item = new SearchEngineResult.ResultItem();

            parser.Init(encoding.GetBytes(html));
            parser.SetEncoding(encoding);

            int  state       = 0;
            bool encodingSet = false;

            for (HTMLchunk chunk = parser.ParseNext(); chunk != null; chunk = parser.ParseNext())
            {
                HTMLchunkType type = chunk.oType;

                if (type == HTMLchunkType.OpenTag)
                {
                    HandleOpenTag(chunk, ref state);

                    // <meta> tags are additionally offered to the encoding handler.
                    if (chunk.sTag == "meta")
                    {
                        HandleMetaEncoding(parser, chunk, ref encodingSet);
                    }

                    HandleParam(chunk, ref state);
                }
                else if (type == HTMLchunkType.CloseTag)
                {
                    HandleCloseTag(chunk, ref state);
                }
                else if (type == HTMLchunkType.Text)
                {
                    HandleText(chunk, ref state);
                }
                // Every other chunk type is ignored.
            }

            return searchResult;
        }

예제 #3

0

파일 보기

파일: DictCnParser.cs 프로젝트: colt365/lunar-thu

        /// <summary>
        /// Parses a dictionary page chunk by chunk and forwards every chunk to the handler
        /// that matches its type. The populated <c>dictResult</c> field is returned.
        /// </summary>
        /// <param name="html">Page markup; it is converted to bytes with <paramref name="encoding"/> before parsing.</param>
        /// <param name="encoding">Encoding used both to produce the byte buffer and to configure the parser.</param>
        /// <returns>The <c>dictResult</c> instance created at the start of this call.</returns>
        public SmartMe.Core.Data.DictResult Parse(string html, Encoding encoding)
        {
            // Fresh result state for this page.
            dictResult = new DictResult();
            HTMLparser parser = HtmlParserFactory.GetInstance();
            dictResult.DictionaryType = DictionaryType.Dict_cn;

            parser.Init(encoding.GetBytes(html));
            parser.SetEncoding(encoding);

            int  state       = 0;
            bool encodingSet = false;

            for (HTMLchunk chunk = parser.ParseNext(); chunk != null; chunk = parser.ParseNext())
            {
                HTMLchunkType type = chunk.oType;

                if (type == HTMLchunkType.OpenTag)
                {
                    HandleOpenTag(chunk, ref state);

                    // <meta> tags are additionally offered to the encoding handler.
                    if (chunk.sTag == "meta")
                    {
                        HandleMetaEncoding(parser, chunk, ref encodingSet);
                    }

                    HandleParam(chunk, ref state);
                }
                else if (type == HTMLchunkType.CloseTag)
                {
                    HandleCloseTag(chunk, ref state);
                }
                else if (type == HTMLchunkType.Text)
                {
                    HandleText(chunk, ref state);
                }
                // Every other chunk type is ignored.
            }

            return dictResult;
        }

예제 #4

0

파일 보기

파일: HtmlSerializer.cs 프로젝트: pnoodles/hfm-net

        /// <summary>
        /// Builds a list of <c>Protein</c> entries from the table rows found in <paramref name="html"/>.
        /// </summary>
        /// <remarks>
        /// Each opening <c>tr</c> tag starts a candidate row. The cell values are read in a fixed
        /// order through <c>GetNextTdValue</c>. A row whose first cell is not an integer is skipped
        /// without reading its remaining cells.
        /// </remarks>
        /// <param name="html">Markup handed to the parser.</param>
        /// <returns>One entry per row whose first cell parsed as an integer project number.</returns>
        private static List <Protein> ParseProteins(string html)
        {
            var htmlParser = new HTMLparser();

            htmlParser.Init(html);
            var list = new List <Protein>();

            HTMLchunk chunk;

            while ((chunk = htmlParser.ParseNext()) != null)
            {
                // Only an opening "tr" tag starts a row. The comparison is ordinal and
                // case-insensitive so it neither depends on the current culture nor
                // throws when the chunk carries no tag name.
                if (chunk.oType != HTMLchunkType.OpenTag ||
                    !string.Equals(chunk.sTag, "tr", StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                int projectNumber;
                if (!Int32.TryParse(GetNextTdValue(htmlParser), NumberStyles.Integer, CultureInfo.InvariantCulture, out projectNumber))
                {
                    // First cell is not a project number: skip this row.
                    continue;
                }

                // The cells must be consumed in exactly this order.
                var p = new Protein();
                p.ProjectNumber = projectNumber;
                p.ServerIP      = GetNextTdValue(htmlParser);
                p.WorkUnitName  = GetNextTdValue(htmlParser);
                p.NumberOfAtoms = ToInt32OrDefault(GetNextTdValue(htmlParser));
                p.PreferredDays = ToDoubleOrDefault(GetNextTdValue(htmlParser));
                p.MaximumDays   = ToDoubleOrDefault(GetNextTdValue(htmlParser));
                p.Credit        = ToDoubleOrDefault(GetNextTdValue(htmlParser));
                p.Frames        = ToInt32OrDefault(GetNextTdValue(htmlParser));
                p.Core          = GetNextTdValue(htmlParser);
                p.Description   = GetNextTdValue(htmlParser, "href");
                p.Contact       = GetNextTdValue(htmlParser);
                p.KFactor       = ToDoubleOrDefault(GetNextTdValue(htmlParser));

                list.Add(p);
            }

            return list;
        }

예제 #5

0

파일 보기

파일: UtilityHtmlParser.cs 프로젝트: TRYMYOUT/lookin

        /// <summary>
        /// Creates a new parser for <paramref name="session"/> and selects the text encoding it should use.
        /// </summary>
        /// <remarks>
        /// The parser is only initialised with the response body when the utility checks report
        /// the response as HTML or XML. The encoding is configured in every case.
        /// </remarks>
        /// <param name="session">Session whose response body and charset are read.</param>
        public void Open(Session session)
        {
            Parser = new HTMLparser();

            try
            {
                if (Utility.IsResponseHtml(session) || Utility.IsResponseXml(session))
                {
                    // A missing body is parsed as an empty buffer.
                    Parser.Init(session.responseBodyBytes ?? new byte[] { });
                    Parser.bAutoKeepScripts  = true;
                    Parser.bEnableHeuristics = false;

                    // When bAutoExtractBetweenTagsOnly is false, the parser will see attributes
                    // in the script tags, such as <script src="mydata">.  Otherwise it will not.
                    Parser.bAutoExtractBetweenTagsOnly = true;
                }
            }
            catch (Exception e)
            {
                Trace.TraceWarning("Warning: UtilityHtmlParser threw an unhandled exception: {0}", e.Message);
                ExceptionLogger.HandleException(e);
            }

            // Get the encoding name from the HTML or HTTP
            string charset = Utility.GetHtmlCharset(session);

            try
            {
                // Probe the charset name first: GetEncoding throws an ArgumentException
                // (or a subclass) for a null or unknown name, so an invalid name never
                // reaches the parser.
                Encoding.GetEncoding(charset);
                Parser.SetEncoding(charset);
            }
            catch (ArgumentException)
            {
                // Default to UTF-8 if the reported charset is not a valid encoding name.
                Parser.SetEncoding(new UTF8Encoding(false, false));
            }
        }

예제 #6

0

파일 보기

파일: HtmlParser.cs 프로젝트: prepare/majestic13

        /// <summary>
        /// Feeds <paramref name="html"/> through the parser and replays every chunk into an
        /// <c>HtmlBuilder</c>, returning what the builder renders.
        /// </summary>
        /// <param name="html">Markup to parse; matches of <c>dedoctype</c> are removed first.</param>
        /// <returns>The result of <c>builder.Render()</c>.</returns>
        public HtmlNode Parse(string html)
        {
            // Majestic12 doesn't support doctype
            html = dedoctype.Replace(html, "");
            var builder = new HtmlBuilder();

            var parser = new HTMLparser();
            parser.bDecodeEntities = false;
            parser.SetChunkHashMode(true);
            parser.Init(html);

            for (var chunk = parser.ParseNext(); chunk != null; chunk = parser.ParseNext())
            {
                switch (chunk.oType)
                {
                case HTMLchunkType.OpenTag:
                    // if something goes wrong - ignore it
                    if (chunk.sTag != "")
                    {
                        builder.OpenTag(chunk.sTag, CollectAttributes(chunk));
                    }
                    break;

                case HTMLchunkType.Comment:
                    builder.AddComment(chunk.oHTML);
                    break;

                case HTMLchunkType.CloseTag:
                    // A close chunk flagged bEndClosure is replayed as open + close.
                    if (chunk.bEndClosure)
                    {
                        builder.OpenTag(chunk.sTag, CollectAttributes(chunk));
                    }
                    builder.CloseTag(chunk.sTag);
                    break;

                case HTMLchunkType.Script:
                    builder.AddScript(chunk.oHTML);
                    break;

                case HTMLchunkType.Text:
                    builder.AddText(chunk.oHTML);
                    break;

                default:
                    break;
                }
            }

            return builder.Render();
        }

        /// <summary>
        /// Copies the parameters of <paramref name="chunk"/> into a new string dictionary;
        /// the dictionary is empty when the chunk reports zero parameters.
        /// </summary>
        private static Dictionary <string, string> CollectAttributes(HTMLchunk chunk)
        {
            var result = new Dictionary <string, string>();
            if (chunk.iParams == 0)
            {
                return result;
            }

            foreach (string key in chunk.oParams.Keys)
            {
                result.Add(key, (string)chunk.oParams[key]);
            }
            return result;
        }

예제 #7

0

파일 보기

        // ============================================================================================================
        // Parse and create boxes
        // ============================================================================================================

        /// <summary>
        /// Converts <paramref name="html"/> into a tree of elements hanging off a new block named "root".
        /// </summary>
        /// <remarks>
        /// Text is added one <c>CharacterElement</c> per character to the block that is current at
        /// that point. Recognised tags either change the style state held by the <c>StyleParser</c>,
        /// open or close a block, or add a single atom. Unrecognised tags are emitted as literal text.
        /// </remarks>
        /// <param name="html">Source text handed to <c>m_Parser</c>.</param>
        /// <returns>The root block created at the start of the call.</returns>
        BlockElement ParseHtmlToBlocks(string html)
        {
            IResourceProvider provider = Service.Get <IResourceProvider>();
            StyleParser       styles = new StyleParser(provider);
            BlockElement      root, currentBlock;

            root = currentBlock = new BlockElement("root", styles.Style); // this is the root!
            // if this is not HTML, do not parse tags. Otherwise search out and interpret tags.
            // NOTE(review): parseHTML is hard-coded to true, so the plain-text branch below never runs.
            bool parseHTML = true;

            if (!parseHTML)
            {
                // Plain-text path: every character becomes an atom of the root block.
                for (int i = 0; i < html.Length; i++)
                {
                    currentBlock.AddAtom(new CharacterElement(styles.Style, html[i]));
                }
            }
            else
            {
                m_Parser.Init(html);
                HTMLchunk chunk;
                // ParseNext(m_Parser) is a helper defined outside this block; the loop ends when it returns null.
                while ((chunk = ParseNext(m_Parser)) != null)
                {
                    // A chunk whose oHTML is not the empty string is treated as text; everything else as a tag.
                    if (!(chunk.oHTML == string.Empty))
                    {
                        // This is a span of text.
                        string text = chunk.oHTML;
                        // make sure to replace escape characters!
                        text = EscapeCharacters.ReplaceEscapeCharacters(text);
                        //Add the characters to the current box
                        for (int i = 0; i < text.Length; i++)
                        {
                            currentBlock.AddAtom(new CharacterElement(styles.Style, text[i]));
                        }
                    }
                    else
                    {
                        // This is a tag. interpret the tag and edit the openTags list.
                        // It may also be an atom, in which case we should add it to the list of atoms!
                        AElement atom = null;
                        // bClosure without bEndClosure is handled as a plain closing tag.
                        if (chunk.bClosure && !chunk.bEndClosure)
                        {
                            styles.CloseOneTag(chunk);
                            // Step back to the parent only when the closing tag names the current block.
                            if (currentBlock.Tag == chunk.sTag)
                            {
                                currentBlock = currentBlock.Parent;
                            }
                        }
                        else
                        {
                            // Set only by the block-level tags below; decides whether the new atom becomes the current block.
                            bool isBlockTag = false;
                            switch (chunk.sTag)
                            {
                            // ====================================================================================
                            // Anchor elements are added to the open tag collection as HREFs.
                            case "a":
                                styles.InterpretHREF(chunk, null);
                                break;

                            // ====================================================================================
                            // These html elements are ignored.
                            case "body":
                                break;

                            // ====================================================================================
                            // These html elements are blocks but can also have styles
                            case "center":
                            case "left":
                            case "right":
                            case "div":
                                atom = new BlockElement(chunk.sTag, styles.Style);
                                styles.ParseTag(chunk, atom);
                                isBlockTag = true;
                                break;

                            // ====================================================================================
                            // These html elements are styles, and are added to the StyleParser.
                            case "span":
                            case "font":
                            case "b":
                            case "i":
                            case "u":
                            case "outline":
                            case "big":
                            case "basefont":
                            case "medium":
                            case "small":
                                styles.ParseTag(chunk, null);
                                break;

                            // ====================================================================================
                            // These html elements are added as atoms only. They cannot impart style
                            // onto other atoms.
                            case "br":
                                // A line break is represented by a newline character atom.
                                atom = new CharacterElement(styles.Style, '\n');
                                break;

                            case "gumpimg":
                                // draw a gump image
                                atom = new ImageElement(styles.Style, ImageElement.ImageTypes.UI);
                                styles.ParseTag(chunk, atom);
                                break;

                            case "itemimg":
                                // draw a static image
                                atom = new ImageElement(styles.Style, ImageElement.ImageTypes.Item);
                                styles.ParseTag(chunk, atom);
                                break;

                            // ====================================================================================
                            // Every other element is not interpreted, but rendered as text. Easy!
                            default:
                            {
                                // The literal text of the tag is sliced out of the input using the chunk's offset and length.
                                string text = html.Substring(chunk.iChunkOffset, chunk.iChunkLength);
                                // make sure to replace escape characters!
                                text = EscapeCharacters.ReplaceEscapeCharacters(text);
                                //Add the characters to the current box
                                for (int i = 0; i < text.Length; i++)
                                {
                                    currentBlock.AddAtom(new CharacterElement(styles.Style, text[i]));
                                }
                            }
                            break;
                            }

                            if (atom != null)
                            {
                                currentBlock.AddAtom(atom);
                                // A block tag that is not flagged bEndClosure becomes the new current block.
                                if (isBlockTag && !chunk.bEndClosure)
                                {
                                    currentBlock = (BlockElement)atom;
                                }
                            }
                            // Runs after every tag handled by this branch, whether or not it produced an atom.
                            styles.CloseAnySoloTags();
                        }
                    }
                }
            }

            return(root);
        }

예제 #8

0

파일 보기

파일: ServerRequestor.cs 프로젝트: octavian-paraschiv/Thorus

        /// <summary>
        /// Fetches the page at the URL built from <c>_noaaSstGetUrl</c>, looks for the first anchor
        /// whose <c>href</c> starts with <c>ftp://</c>, and downloads it to <c>SST.nc</c> in the
        /// simulation work folder unless that file already exists.
        /// </summary>
        /// <returns>
        /// <c>true</c> when an FTP link was found and <c>SST.nc</c> is present afterwards (either
        /// downloaded now or already on disk); <c>false</c> when no link was found or any step failed.
        /// </returns>
        private bool RequestSstFile()
        {
            try
            {
                DateTime dt     = DateTime.Today.AddDays(-10);
                string   getUrl = ReplaceMacros(_noaaSstGetUrl, dt);
                string   response;

                // Unsubscribe before subscribing so repeated calls do not stack duplicate callbacks
                // on this process-wide delegate.
                ServicePointManager.ServerCertificateValidationCallback -= ValidateRemoteCertificate;
                ServicePointManager.ServerCertificateValidationCallback += ValidateRemoteCertificate;
                ServicePointManager.SecurityProtocol = (SecurityProtocolType)3072; //TLS 1.2

                using (WebClient wc = new WebClient())
                {
                    response = wc.DownloadString(getUrl);
                }

                HTMLparser parser = new HTMLparser();
                parser.Init(response);

                HTMLchunk chunk;
                while ((chunk = parser.ParseNext()) != null)
                {
                    // Only opening <a> tags that carry parameters are of interest.
                    if (chunk.sTag != "a" ||
                        chunk.oType != HTMLchunkType.OpenTag ||
                        chunk.oParams == null ||
                        chunk.oParams.Count < 1)
                    {
                        continue;
                    }

                    string href = chunk.oParams["href"]?.ToString();
                    if (string.IsNullOrEmpty(href) ||
                        !href.StartsWith("ftp://", StringComparison.Ordinal))
                    {
                        continue;
                    }

                    string file = Path.Combine(SimulationData.WorkFolder, "SST.nc");
                    if (!File.Exists(file))
                    {
                        using (WebClient wc = new WebClient())
                        {
                            wc.DownloadFile(href, file);
                        }
                    }

                    // Every FTP link maps to the same target file, so later links would be
                    // skipped anyway once the file exists; report success now.
                    return true;
                }
            }
            catch (Exception ex)
            {
                // Best effort: the failure is reported to the caller through the return value,
                // but it is no longer silently discarded.
                System.Diagnostics.Trace.TraceWarning("RequestSstFile failed: {0}", ex);
            }

            return false;
        }

예제 #9

0

파일 보기

        /// <summary>
        /// Parses <paramref name="htmlAsBytes"/> and rebuilds the chunk stream as an XML tree under
        /// a synthetic <c>document</c> element, returning that element's child nodes.
        /// </summary>
        /// <remarks>
        /// <c>currentNode</c> is the insertion cursor: opening tags descend into the new element,
        /// closing tags climb to the parent of the nearest ancestor with the same name. Scripts are
        /// replaced by a <c>script</c> element containing the text "REMOVED".
        /// </remarks>
        /// <param name="htmlAsBytes">Raw markup handed to the parser.</param>
        /// <returns>The nodes directly below the synthetic root element.</returns>
        /// <exception cref="Exception">
        /// Thrown when handling a chunk fails. The original exception is the inner exception and
        /// the chunk's bytes, converted to characters, are stored under the "source" data key.
        /// </exception>
        public static IEnumerable <XNode> ConvertNodesToXml(byte[] htmlAsBytes)
        {
            HTMLparser parser = OpenParser();

            parser.Init(htmlAsBytes);

            XElement currentNode = new XElement("document");

            HTMLchunk m12chunk;

            int xmlnsAttributeIndex = 0;

            // Passed to the tag helpers as-is; this method never fills it in before calling them.
            string originalHtml = "";

            while ((m12chunk = parser.ParseNext()) != null)
            {
                try
                {
                    Debug.Assert(!m12chunk.bHashMode);  // popular default for Majestic-12 setting

                    XNode newNode;

                    switch (m12chunk.oType)
                    {
                    case HTMLchunkType.OpenTag:

                        // Tags are added as a child to the current tag,
                        // except when the new tag implies the closure of
                        // some number of ancestor tags.
                        newNode = ParseTagNode(m12chunk, originalHtml, ref xmlnsAttributeIndex);

                        if (newNode != null)
                        {
                            currentNode = FindParentOfNewNode(m12chunk, originalHtml, currentNode);
                            currentNode.Add(newNode);

                            // Descend only into elements. If the helper ever returns another
                            // node kind, stay on the parent instead of nulling the cursor.
                            currentNode = newNode as XElement ?? currentNode;
                        }

                        break;

                    case HTMLchunkType.CloseTag:

                        if (m12chunk.bEndClosure)
                        {
                            // Chunk flagged bEndClosure: add the node but do not descend into it.
                            newNode = ParseTagNode(m12chunk, originalHtml, ref xmlnsAttributeIndex);

                            if (newNode != null)
                            {
                                currentNode = FindParentOfNewNode(m12chunk, originalHtml, currentNode);
                                currentNode.Add(newNode);
                            }
                        }
                        else
                        {
                            string m12chunkCleanedTag = CleanupTagName(m12chunk.sTag, originalHtml);

                            // Find the nearest open ancestor with the same name.
                            XElement nodeToClose = currentNode;
                            while (nodeToClose != null && nodeToClose.Name.LocalName != m12chunkCleanedTag)
                            {
                                nodeToClose = nodeToClose.Parent;
                            }

                            // Never climb above the synthetic root: a closing tag that matches the
                            // root's own name would otherwise leave the cursor null and make every
                            // later chunk fail with a NullReferenceException.
                            if (nodeToClose != null && nodeToClose.Parent != null)
                            {
                                currentNode = nodeToClose.Parent;
                            }

                            Debug.Assert(currentNode != null);
                        }

                        break;

                    case HTMLchunkType.Script:

                        currentNode.Add(new XElement("script", "REMOVED"));
                        break;

                    case HTMLchunkType.Comment:

                        if (m12chunk.sTag == "!--")
                        {
                            newNode = new XComment(m12chunk.oHTML);
                        }
                        else if (m12chunk.sTag == "![CDATA[")
                        {
                            newNode = new XCData(m12chunk.oHTML);
                        }
                        else
                        {
                            throw new Exception("Unrecognized comment sTag");
                        }

                        currentNode.Add(newNode);

                        break;

                    case HTMLchunkType.Text:

                        currentNode.Add(m12chunk.oHTML);
                        break;

                    default:
                        break;
                    }
                }
                catch (Exception e)
                {
                    var wrappedE = new Exception("Error using Majestic12.HTMLChunk, reason: " + e.Message, e);

                    // the original html is copied for tracing/debugging purposes
                    string failingSource = new string(htmlAsBytes.Skip(m12chunk.iChunkOffset)
                                                      .Take(m12chunk.iChunkLength)
                                                      .Select(b => (char)b).ToArray());

                    wrappedE.Data.Add("source", failingSource);

                    throw wrappedE;
                }
            }

            // Climb back to the synthetic root before handing out its children.
            while (currentNode.Parent != null)
            {
                currentNode = currentNode.Parent;
            }

            return currentNode.Nodes();
        }

예제 #10

0

파일 보기

파일: ProteinCollection.cs 프로젝트: BGCX067/fahlogstats-net-svn-to-git

        /// <summary>
        /// Download project information from Stanford University (psummaryC.html)
        /// </summary>
        /// <remarks>
        /// The page is fetched (through the configured proxy when enabled), every table row is
        /// parsed into a <c>Protein</c> and stored in this collection under its project number.
        /// Rows that cannot be parsed are logged and skipped. When the download itself fails the
        /// method logs and returns without touching the collection or the CSV file.
        /// </remarks>
        /// <param name="State">Null in this implementation</param>
        public void DownloadFromStanford(Object State /* null */)
        {
            DateTime Start = Debug.ExecStart;

            // NOTE(review): lock (this) is kept as-is because other members may synchronise on
            // the same instance; switching to a private lock object needs a class-wide change.
            lock (this)
            {
                Preferences.PreferenceSet Prefs = Preferences.PreferenceSet.Instance;

                WebRequest wrq = WebRequest.Create("http://vspx27.stanford.edu/psummaryC.html");
                wrq.Method = WebRequestMethods.Http.Get;
                if (Prefs.UseProxy)
                {
                    wrq.Proxy = new WebProxy(Prefs.ProxyServer, Prefs.ProxyPort);
                    if (Prefs.UseProxyAuth)
                    {
                        wrq.Proxy.Credentials = new NetworkCredential(Prefs.ProxyUser, Prefs.ProxyPass);
                    }
                }
                else
                {
                    wrq.Proxy = null;
                }

                // TODO: Handle timeouts and errors
                String sSummaryPage;
                try
                {
                    // The response and reader are disposed as soon as the page text has been read,
                    // and a failure while reading is handled the same way as a failed request.
                    using (WebResponse wrs = wrq.GetResponse())
                    {
                        Stream body = (wrs == null) ? null : wrs.GetResponseStream();
                        if (body == null)
                        {
                            throw new IOException("The web response or stream was null");
                        }

                        using (StreamReader sr1 = new StreamReader(body, Encoding.ASCII))
                        {
                            sSummaryPage = sr1.ReadToEnd();
                        }
                    }
                }
                catch (WebException ExWeb)
                {
                    ClassLogger.LogException(LogLevel.Warn, String.Format("{0} threw WebException {1}.", Debug.FunctionName, ExWeb.Message), null);
                    ClassLogger.Log(LogLevel.Trace, String.Format("{0} Execution Time: {1}", Debug.FunctionName, Debug.GetExecTime(Start)), "");
                    return;
                }
                catch (IOException ExIO)
                {
                    ClassLogger.LogException(LogLevel.Warn, String.Format("{0} threw IOException {1}.", Debug.FunctionName, ExIO.Message), null);
                    ClassLogger.Log(LogLevel.Trace, String.Format("{0} Execution Time: {1}", Debug.FunctionName, Debug.GetExecTime(Start)), "");
                    return;
                }
                catch (Exception Ex)
                {
                    // Any other failure; the message no longer mislabels it as a WebException.
                    ClassLogger.LogException(LogLevel.Warn, String.Format("{0} threw Exception {1}.", Debug.FunctionName, Ex.Message), null);
                    ClassLogger.Log(LogLevel.Trace, String.Format("{0} Execution Time: {1}", Debug.FunctionName, Debug.GetExecTime(Start)), "");
                    return;
                }

                HTMLparser pSummary = new HTMLparser();
                pSummary.Init(sSummaryPage);

                HTMLchunk oChunk;

                // Parse until returned oChunk is null indicating we reached end of parsing
                while ((oChunk = pSummary.ParseNext()) != null)
                {
                    if (!String.Equals(oChunk.sTag, "tr", StringComparison.OrdinalIgnoreCase))
                    {
                        continue;
                    }

                    Protein p = new Protein();

                    // Advance to the first "td" chunk of the row (or to the end of the document).
                    while (((oChunk = pSummary.ParseNext()) != null) &&
                           !String.Equals(oChunk.sTag, "td", StringComparison.OrdinalIgnoreCase))
                    {
                        // Do nothing!
                    }

                    // Skip the empty attributes
                    oChunk = pSummary.ParseNext();
                    try
                    {
                        // Each following value sits three chunks after the previous one
                        // (closing tag, opening tag, then the value chunk itself).
                        p.ProjectNumber = Int32.Parse(oChunk.oHTML, System.Globalization.CultureInfo.InvariantCulture);
                        p.ServerIP      = AdvanceChunks(pSummary, 3).oHTML.Trim();
                        p.WorkUnitName  = AdvanceChunks(pSummary, 3).oHTML.Trim();
                        p.NumAtoms      = Int32.Parse(AdvanceChunks(pSummary, 3).oHTML, System.Globalization.CultureInfo.InvariantCulture);
                        p.PreferredDays = ParseIntegerPart(AdvanceChunks(pSummary, 3).oHTML);

                        oChunk = AdvanceChunks(pSummary, 3);
                        try
                        {
                            p.MaxDays = ParseIntegerPart(oChunk.oHTML);
                        }
                        catch (Exception)
                        {
                            // An unreadable maximum is stored as 0 instead of dropping the row.
                            p.MaxDays = 0;
                        }

                        p.Credit      = ParseIntegerPart(AdvanceChunks(pSummary, 3).oHTML);
                        p.Frames      = Int32.Parse(AdvanceChunks(pSummary, 3).oHTML.Trim(), System.Globalization.CultureInfo.InvariantCulture);
                        p.Core        = AdvanceChunks(pSummary, 3).oHTML;
                        p.Description = AdvanceChunks(pSummary, 3).oParams["href"].ToString();

                        // The contact value is six chunks further on.
                        p.Contact = AdvanceChunks(pSummary, 6).oHTML;

                        if (this.ContainsKey(p.ProjectNumber))
                        {
                            this[p.ProjectNumber] = p;
                        }
                        else
                        {
                            this.Add(p.ProjectNumber, p);
                        }
                    }
                    catch (Exception Ex)
                    {
                        // Ignore this row of the table - unparseable
                        ClassLogger.LogException(LogLevel.Warn, String.Format("{0} threw exception while parsing HTML: {1}", Debug.FunctionName, Ex.Message), null);
                    }
                }

                if (this.Count > 0)
                {
                    OnNFOUpdated(new NFOUpdatedEventArgs());
                }
            }

            SaveToCSV(_LocalNFOFile);

            ClassLogger.Log(LogLevel.Trace, String.Format("{0} loaded {1} proteins from Stanford", Debug.FunctionName, ProteinCollection.Instance.Count), "");
            ClassLogger.Log(LogLevel.Trace, String.Format("{0} Execution Time: {1}", Debug.FunctionName, Debug.GetExecTime(Start)), "");
        }

        /// <summary>
        /// Calls <c>ParseNext</c> <paramref name="count"/> times and returns the last chunk
        /// (null when the parser ran out of chunks).
        /// </summary>
        private static HTMLchunk AdvanceChunks(HTMLparser parser, int count)
        {
            HTMLchunk chunk = null;
            for (int i = 0; i < count; i++)
            {
                chunk = parser.ParseNext();
            }
            return chunk;
        }

        /// <summary>
        /// Parses the digits in front of the first '.' of <paramref name="text"/> as an integer.
        /// Throws when the text is null, contains no '.', or the prefix is not a number.
        /// </summary>
        private static int ParseIntegerPart(string text)
        {
            return Int32.Parse(text.Substring(0, text.IndexOf('.')).Trim(), System.Globalization.CultureInfo.InvariantCulture);
        }

예제 #11

0

파일 보기

        /// <summary>
        /// Tests the parser by parsing a chunk of data, regenerating HTML from the parsed chunks
        /// and comparing the result with the expected HTML; any discrepancy fails the assertion.
        /// </summary>
        /// <remarks>
        /// Nothing is parsed or asserted when <paramref name="sExpectedHTML"/> is null.
        /// </remarks>
        /// <param name="bData">Data to parse</param>
        /// <param name="sExpectedHTML">Expected HTML as it gets generated by this very function</param>
        void TestParser(byte[] bData, string sExpectedHTML)
        {
            // no expectation supplied - nothing to compare against
            if (sExpectedHTML == null)
            {
                return;
            }

            StringBuilder oSB = new StringBuilder(512);

            // becomes true once a meta tag has supplied the document encoding
            bool bEncodingSet = false;

            oP.Init(bData);

            // ok lets parse HTML and save the HTML that we view back into string
            HTMLchunk oChunk;

            // we don't want to use hashes as they would change order in which params are made
            oP.SetChunkHashMode(false);

            // we parse until returned oChunk is null indicating we reached end of parsing
            while ((oChunk = oP.ParseNext()) != null)
            {
                switch (oChunk.oType)
                {
                case HTMLchunkType.OpenTag:

                    oSB.AppendFormat("<{0}", oChunk.sTag);

// shared tail: the CloseTag and Script cases jump here to print params and the closing bracket
PrintParams:

                    // only the first successfully handled meta tag is allowed to set the encoding
                    if (!bEncodingSet && oChunk.sTag.Length == 4 && oChunk.sTag == "meta")
                    {
                        if (HTMLparser.HandleMetaEncoding(oP, oChunk, ref bEncodingSet) && bEncodingSet)
                        {
                            // possible Title re-encoding should happen here
                        }
                    }

                    // commented out call to code that will do the job for you - long code below
                    // is left to demonstrate how to access individual param values
                    // Console.WriteLine(oChunk.GenerateParamsHTML());

                    if (oChunk.bHashMode)
                    {
                        // hash mode: params are looked up by name, values are always single-quoted
                        foreach (string sParam in oChunk.oParams.Keys)
                        {
                            string sValue = oChunk.oParams[sParam].ToString();

                            if (sValue.Length > 0)
                            {
                                oSB.AppendFormat(" {0}='{1}'", sParam, oP.ChangeToEntities(sValue));
                            }
                            else
                            {
                                // valueless param, ie <option selected>
                                oSB.AppendFormat(" {0}", sParam);
                            }
                        }
                    }
                    else
                    {
                        // this is alternative method of getting params -- it may look less convenient
                        // but it saves a LOT of CPU ticks while parsing. It makes sense when you only need
                        // params for a few
                        for (int i = 0; i < oChunk.iParams; i++)
                        {
                            // here we can use exactly the same single/double quotes as they
                            // were used on params

                            string sValue = oChunk.sValues[i];

                            if (oChunk.bEntities)
                            {
                                sValue = oP.ChangeToEntities(sValue);
                            }

                            switch (oChunk.cParamChars[i])
                            {
                            // a space as the quote char means the value was not quoted
                            case (byte)' ':
                                if (oChunk.sValues[i].Length == 0)
                                {
                                    oSB.AppendFormat(" {0}", oChunk.sParams[i]);
                                }
                                else
                                {
                                    oSB.AppendFormat(" {0}={1}", oChunk.sParams[i], sValue);
                                }
                                break;

                            // any other char is the original quote and wraps the value on both sides
                            default:
                                oSB.AppendFormat(" {0}={1}{2}{1}", oChunk.sParams[i], (char)oChunk.cParamChars[i], sValue);
                                break;
                            }
                        }
                    }

                    if (oChunk.bClosure && !oP.bAutoMarkClosedTagsWithParamsAsOpen)
                    {
                        oSB.Append("/>");
                    }
                    else
                    {
                        oSB.Append(">");
                    }
                    break;

                // matched close tag, ie </a>
                case HTMLchunkType.CloseTag:

                    if (oChunk.iParams > 0)
                    {
                        // close tag that carries params is printed like an open tag
                        oSB.AppendFormat("<{0}", oChunk.sTag);
                        goto PrintParams;
                    }

                    if (oChunk.bEndClosure)
                    {
                        oSB.AppendFormat("<{0}/>", oChunk.sTag);
                    }
                    else
                    {
                        oSB.AppendFormat("</{0}>", oChunk.sTag);
                    }

                    break;

                // NOTE: you have to call finalisation because it is not done for Scripts or comments
                // Matched data between <script></script> tags
                case HTMLchunkType.Script:

                    if (!oP.bAutoKeepScripts && !oP.bKeepRawHTML)
                    {
                        oP.SetRawHTML(oChunk);
                    }

                    // Append verbatim: script text routinely contains '{' and '}', which AppendFormat
                    // would interpret as format items (FormatException, or "{{" collapsed to "{")
                    oSB.Append(oChunk.oHTML);

                    if (oChunk.iParams > 0)
                    {
                        goto PrintParams;
                    }

                    break;

                // NOTE: you have to call finalisation because it is not done for Scripts or comments
                // matched HTML comment, that's stuff between <!-- and -->
                case HTMLchunkType.Comment:
                    if (!oP.bAutoExtractBetweenTagsOnly)
                    {
                        // presumably oHTML already holds the comment markers in this mode - not visible here
                        oSB.Append(oChunk.oHTML);
                    }
                    else
                    {
                        oSB.AppendFormat("<!--{0}-->", oChunk.oHTML);
                    }
                    break;

                // matched normal text
                case HTMLchunkType.Text:

                    // skip pure whitespace that we are not really interested in
                    if (oP.bCompressWhiteSpaceBeforeTag && oChunk.oHTML.Trim().Length == 0)
                    {
                        continue;
                    }

                    oSB.Append(oChunk.bEntities ? oP.ChangeToEntities(oChunk.oHTML) : oChunk.oHTML);
                    break;
                }
            }

            // now compare parsed HTML with the one we expect
            Assert.AreEqual(sExpectedHTML, oSB.ToString());
        }

C# (CSharp) HTMLparser.Init 예제들