A parsed HTML token: text, a comment, a script, an open tag, or a close tag, as indicated by its oType field.
Inheritance: IDisposable
示例#1
0
        /// <summary>
        /// Benchmark helper: pulls every chunk out of the parser without doing any work per chunk,
        /// so that nothing but the HTML parsing itself is exercised
        /// </summary>
        /// <param name="oP">Parser object to pull chunks from</param>
        void BenchMarkParse(HTMLparser oP)
        {
            // the parser hands back HTMLchunk tokens and re-uses the same object, so the chunk
            // must NOT be destroyed before parsing has finished
            HTMLchunk oCurrent = oP.ParseNext();

            // a null chunk means the end of the data was reached
            while (oCurrent != null)
            {
                switch (oCurrent.oType)
                {
                    // open tag, ie <a href="">
                    case HTMLchunkType.OpenTag:
                    // close tag, ie </a>
                    case HTMLchunkType.CloseTag:
                    // normal text
                    case HTMLchunkType.Text:
                    // HTML comment, that's stuff between <!-- and -->
                    case HTMLchunkType.Comment:
                        // deliberately nothing to do for any chunk type
                        break;
                }

                oCurrent = oP.ParseNext();
            }
        }
示例#2
0
        /// <summary>
        /// Picks the element a newly opened tag should be attached to.
        /// For form, td, tr, thead, tbody and tfoot tags the chain made of
        /// <paramref name="nextPotentialParent"/> and its ancestors is searched, nearest first, for an
        /// element with the same name; the search gives up at the enclosing container
        /// (tr for td; table/thead/tbody/tfoot for tr; table for thead/tbody/tfoot; no limit for form).
        /// When such an element is found its parent is returned, which treats the earlier element as closed.
        /// </summary>
        /// <param name="m12chunk">Chunk describing the new tag</param>
        /// <param name="originalHtml">Raw HTML of the chunk, handed to CleanupTagName</param>
        /// <param name="nextPotentialParent">Element that becomes the parent when nothing is implicitly closed</param>
        /// <returns>
        /// Parent of the implicitly closed element, or <paramref name="nextPotentialParent"/> when no such
        /// element exists or that element has no parent
        /// </returns>
        static XElement FindParentOfNewNode(Majestic12.HTMLchunk m12chunk, string originalHtml, XElement nextPotentialParent)
        {
            string newTagName = CleanupTagName(m12chunk.sTag, originalHtml);

            // names of the containers the upward search must not cross for this kind of tag
            string[] stopTags;

            switch (newTagName)
            {
                case "form":
                    stopTags = new string[0];
                    break;

                case "td":
                    stopTags = new[] { "tr" };
                    break;

                case "tr":
                    stopTags = new[] { "table", "thead", "tbody", "tfoot" };
                    break;

                case "thead":
                case "tbody":
                case "tfoot":
                    stopTags = new[] { "table" };
                    break;

                default:
                    // any other tag never implies that an earlier tag was closed
                    return(nextPotentialParent);
            }

            // walk from the proposed parent towards the root
            for (XElement candidate = nextPotentialParent; candidate != null; candidate = candidate.Parent)
            {
                if (stopTags.Any(stopTag => stopTag == candidate.Name))
                {
                    break;
                }

                if (newTagName == candidate.Name)
                {
                    // nearest element with the same name: the new tag belongs next to it
                    return(candidate.Parent ?? nextPotentialParent);
                }
            }

            return(nextPotentialParent);
        }
示例#3
0
        /// <summary>
        /// Parses the tag that begins at the current position (the point where the opening
        /// angle bracket was found) and returns the chunk describing it
        /// </summary>
        /// <returns>Chunk filled by the tag parser, or the chunk returned by the script parser for a non-closure script tag</returns>
        HTMLchunk GetNextTag()
        {
            oChunk = oTP.ParseTag(ref iCurPos);

            // for backwards compatibility mark closed tags with params as open
            bool bCloseTagWithParams = oChunk.iParams > 0
                                       && bAutoMarkClosedTagsWithParamsAsOpen
                                       && oChunk.oType == HTMLchunkType.CloseTag;

            if (bCloseTagWithParams)
            {
                oChunk.oType = HTMLchunkType.OpenTag;
            }

            // start of script: length and first character are tested before the full string comparison
            bool bScriptTag = oChunk.sTag.Length == 6 && oChunk.sTag[0] == 's' && oChunk.sTag == "script";

            if (bScriptTag && !oChunk.bClosure)
            {
                oChunk.oType = HTMLchunkType.Script;
                oChunk       = oTP.ParseScript(ref iCurPos);
                return(oChunk);
            }

            // the chunk covers everything from its offset up to the position the tag parser stopped at
            oChunk.iChunkLength = iCurPos - oChunk.iChunkOffset;

            if (bKeepRawHTML)
            {
                oChunk.oHTML = oEnc.GetString(bHTML, oChunk.iChunkOffset, oChunk.iChunkLength);
            }

            return(oChunk);
        }
示例#4
0
        /// <summary>
        /// Advances the page-scraping state machine when a closing tag is met
        /// </summary>
        /// <param name="oChunk">Chunk describing the closing tag</param>
        /// <param name="state">Current state; overwritten when a transition applies</param>
        private void HandleCloseTag( HTMLchunk oChunk, ref int state )
        {
            // states 2 and 4 move on with any closing tag, whatever its name
            if ( state == 2 || state == 4 )
            {
                state++;
                return;
            }

            switch ( oChunk.sTag )
            {
                case "ol":
                    if ( state == 9 )
                    {
                        state = 10;
                    }
                    break;

                case "table":
                    if ( state == 12 )
                    {
                        state = 13;
                    }
                    break;

                case "div":
                    if ( state == 6 || state == 14 || state == 16 )
                    {
                        // divCount is decremented for every closing div seen in these states;
                        // the state only advances once it reaches zero
                        divCount--;

                        if ( divCount == 0 )
                        {
                            state++;
                        }
                    }
                    break;
            }
        }
示例#5
0
        /// <summary>
        /// Disposes the helper objects held by this instance and drops the reference to the HTML buffer.
        /// Only the first call does any work; later calls return immediately.
        /// </summary>
        /// <param name="bDisposing">Not read by this implementation</param>
        private void Dispose(bool bDisposing)
        {
            if (bDisposed)
            {
                return;
            }

            // flag first, so the work below cannot run a second time
            bDisposed = true;

            if (null != oChunk)
            {
                oChunk.Dispose();
            }
            oChunk = null;

            if (null != sText)
            {
                sText.Dispose();
            }
            sText = null;

            bHTML = null;

            if (null != oE)
            {
                oE.Dispose();
            }
            oE = null;

            if (null != oTP)
            {
                oTP.Dispose();
            }
            oTP = null;
        }
示例#6
0
 /// <summary>
 /// Updates the state machine when a closing tag is met: a closing "ol" resets the state to 0,
 /// a closing "a" advances states 5, 9 and 11 by one
 /// </summary>
 /// <param name="oChunk">Chunk describing the closing tag</param>
 /// <param name="state">Current state; overwritten when a transition applies</param>
 private void HandleCloseTag(HTMLchunk oChunk, ref int state)
 {
     switch (oChunk.sTag)
     {
         case "ol":
             state = 0;
             break;

         case "a":
             if (state == 5 || state == 9 || state == 11)
             {
                 state++;
             }
             break;
     }
 }
        /// <summary>
        /// Drops the references held by this instance (buffer, chunk, text, entities, parser).
        /// Only the first call does any work; later calls return immediately.
        /// </summary>
        /// <param name="bDisposing">Not read by this implementation</param>
        private void Dispose(bool bDisposing)
        {
            if (bDisposed)
            {
                return;
            }

            bDisposed = true;

            // nothing is disposed here, the references are only released
            bHTML  = null;
            oChunk = null;
            sText  = null;
            oE     = null;
            oP     = null;
        }
        /// <summary>
        /// Inits tag parser by storing the objects and the data buffer it is given
        /// </summary>
        /// <param name="p_oP">Parser object, kept in oP</param>
        /// <param name="p_oChunk">Chunk object, kept in oChunk</param>
        /// <param name="p_sText">Text buffer, kept in sText</param>
        /// <param name="p_bHTML">HTML data as bytes, kept in bHTML</param>
        /// <param name="p_iDataLength">Data length, kept in iDataLength</param>
        /// <param name="p_oE">Entities object, kept in oE</param>
        /// <param name="p_oHE">Heuristics object, kept in oHE</param>
        internal void Init(HTMLparser p_oP, HTMLchunk p_oChunk, DynaString p_sText, byte[] p_bHTML, int p_iDataLength, HTMLentities p_oE, HTMLheuristics p_oHE)
        {
            // helper objects
            oP  = p_oP;
            oE  = p_oE;
            oHE = p_oHE;

            // working chunk and text buffer
            oChunk = p_oChunk;
            sText  = p_sText;

            // raw data and its length
            bHTML       = p_bHTML;
            iDataLength = p_iDataLength;

            // heuristics must stop short of the end of data, so their limit is the data length
            // reduced by MIN_DATA_SIZE_FOR_HEURISTICS
            iMaxHeuDataLength = iDataLength - MIN_DATA_SIZE_FOR_HEURISTICS;
        }
示例#9
0
        /// <summary>
        /// Passes a META chunk to HTMLparser.HandleMetaEncoding unless an encoding was already set,
        /// and reports on the console when that call returns true without the encoding having been set
        /// </summary>
        /// <param name="oP">Parser object</param>
        /// <param name="oChunk">Chunk to inspect</param>
        /// <param name="bEncodingSet">Whether an encoding has been set; may be updated by the call</param>
        private void HandleMetaEncoding(HTMLparser oP, HTMLchunk oChunk, ref bool bEncodingSet)
        {
            // major browsers treat the first encoding as the correct one,
            // so nothing is attempted once an encoding has been set
            if (!bEncodingSet)
            {
                bool bHandled = HTMLparser.HandleMetaEncoding(oP, oChunk, ref bEncodingSet);

                if (bHandled && !bEncodingSet)
                {
                    Console.WriteLine("Failed to set encoding from META: {0}", oChunk.GenerateHTML());
                }
            }
        }
示例#10
0
        /// <summary>
        /// Updates the state machine when a closing tag is met: a closing "tr" sets the state to 0,
        /// a closing "td" sets it to 1, a closing "a" advances states 4, 6 and 8 by one
        /// </summary>
        /// <param name="oChunk">Chunk describing the closing tag</param>
        /// <param name="state">Current state; overwritten when a transition applies</param>
        private void HandleCloseTag(HTMLchunk oChunk, ref int state)
        {
            switch (oChunk.sTag)
            {
                case "tr":
                    state = 0;
                    break;

                case "td":
                    state = 1;
                    break;

                case "a":
                    if (state == 4 || state == 6 || state == 8)
                    {
                        state++;
                    }
                    break;
            }
        }
示例#11
0
        /// <summary>
        /// Handles META tags that set page encoding: the chunk is passed to
        /// HTMLparser.HandleMetaEncoding, and a message is written to the console when that call
        /// returns true but the encoding is still not set
        /// </summary>
        /// <param name="oP">Parser object</param>
        /// <param name="oChunk">Chunk</param>
        /// <param name="bEncodingSet">Whether an encoding has been set; may be updated by the call</param>
        void HandleMetaEncoding(HTMLparser oP, HTMLchunk oChunk, ref bool bEncodingSet)
        {
            // the first encoding wins (the logic major browsers follow), so once one is set
            // no further attempt is made
            if (bEncodingSet)
            {
                return;
            }

            if (HTMLparser.HandleMetaEncoding(oP, oChunk, ref bEncodingSet) && !bEncodingSet)
            {
                Console.WriteLine("Failed to set encoding from META: {0}", oChunk.GenerateHTML());
            }
        }
示例#12
0
        /// <summary>
        /// Updates the state machine when a closing tag is met. A closing "li" in state 7 also stores
        /// the current result item, provided it has a non-empty Url, and starts a new one.
        /// </summary>
        /// <param name="oChunk">Chunk describing the closing tag</param>
        /// <param name="state">Current state; overwritten when a transition applies</param>
        private void HandleCloseTag(HTMLchunk oChunk, ref int state)
        {
            switch (oChunk.sTag)
            {
                case "a":
                    if (state == 6)
                    {
                        state = 7;
                    }
                    break;

                case "li":
                    if (state == 7)
                    {
                        state = 4;

                        // items without a Url are not stored and keep being filled
                        bool bHasUrl = item.Url != null && item.Url != "";

                        if (bHasUrl)
                        {
                            searchResult.Results.Add(item);
                            item = new SearchEngineResult.ResultItem();
                        }
                    }
                    break;

                case "ul":
                    if (state == 4)
                    {
                        state = -1;
                    }
                    break;
            }
        }
示例#13
0
        /// <summary>
        /// Parses an HTML string and returns a copy of every chunk whose raw HTML is not
        /// empty or whitespace only
        /// </summary>
        /// <param name="str">HTML source to parse</param>
        /// <returns>Copied chunks in parse order; each copy carries oHTML, oType and sTag only</returns>
        private HTMLchunk[] htmlParse(string str)
        {
            //return value
            ArrayList ret = new ArrayList();

            //init parser
            Majestic12.HTMLparser parser = new Majestic12.HTMLparser();

            //keep raw html because we need to reconstruct it
            parser.bKeepRawHTML = true;
            //keep text... this is for parsing just tags
            parser.bTextMode = true;
            //initialize to parse the string
            parser.Init(str);

            try
            {
                Majestic12.HTMLchunk chunk = null;
                // we parse until returned chunk is null indicating we reached end of parsing
                while ((chunk = parser.ParseNext()) != null)
                {
                    //discard empty blocks for performance increase
                    string html = chunk.oHTML;
                    if (html == null || html.Trim().Length == 0)
                    {
                        continue;
                    }

                    //copy the fields we need into a chunk of our own, so the result does not depend
                    //on the chunk object handed out by the parser; strings are immutable, so sharing
                    //the references is safe and the obsolete String.Copy (which throws on null) is not needed
                    HTMLchunk clone = new HTMLchunk(false);
                    clone.oHTML = html;
                    clone.oType = chunk.oType;
                    clone.sTag  = chunk.sTag;

                    ret.Add(clone);
                }
            }
            finally
            {
                //reset the parser even when parsing throws
                parser.CleanUp();
            }

            //return the collected chunks as an array
            return((HTMLchunk[])ret.ToArray(typeof(HTMLchunk)));
        }
示例#14
0
        /// <summary>
        /// Inspects the attributes of a tag and updates the state machine and the current
        /// result item accordingly
        /// </summary>
        /// <param name="oChunk">Chunk whose first iParams entries of sParams/sValues are inspected</param>
        /// <param name="state">Current state; overwritten when a transition applies</param>
        private void HandleParam(HTMLchunk oChunk, ref int state)
        {
            for (int i = 0; i < oChunk.iParams; i++)
            {
                string sValue = oChunk.sValues[i];
                string sName  = oChunk.sParams[i];

                if (sValue == "f" && sName == "class" && state == 2)
                {
                    state = 3;

                    // a new result block starts: store the previous item if it got a Url
                    if (!string.IsNullOrEmpty(item.Url))
                    {
                        searchResult.Results.Add(item);
                        item = new SearchEngineResult.ResultItem();
                    }
                }
                else if (sName == "href")
                {
                    // which link this is depends on the state reached so far
                    if (state == 4)
                    {
                        item.Url = sValue;
                    }
                    else if (state == 7)
                    {
                        item.CacheUrl = sValue;
                    }
                    else if (state == 10)
                    {
                        item.SimilarUrl = sValue;
                    }
                }
                else if (sName == "id" && (state == 6 || state == 9) && sValue != null)
                {
                    // ids are machine-readable tokens, so they are compared ordinally
                    // rather than with the rules of the current culture
                    if (sValue.StartsWith("sogou_snapshot", System.StringComparison.Ordinal))
                    {
                        state = 7;
                    }
                    else if (sValue.StartsWith("sogou_sis", System.StringComparison.Ordinal))
                    {
                        state = 10;
                    }
                }
            }
        }
示例#15
0
        /// <summary>
        /// Inits tag parser: remembers the objects and the data buffer passed in
        /// </summary>
        /// <param name="p_oP">Parser object, kept in oP</param>
        /// <param name="p_oChunk">Chunk object, kept in oChunk</param>
        /// <param name="p_sText">Text buffer, kept in sText</param>
        /// <param name="p_bHTML">HTML data as bytes, kept in bHTML</param>
        /// <param name="p_iDataLength">Data length, kept in iDataLength</param>
        /// <param name="p_oE">Entities object, kept in oE</param>
        /// <param name="p_oHE">Heuristics object, kept in oHE</param>
        internal void Init(HTMLparser p_oP,HTMLchunk p_oChunk,DynaString p_sText,byte[] p_bHTML,int p_iDataLength,HTMLentities p_oE,HTMLheuristics p_oHE)
        {
            // raw data and its length
            bHTML = p_bHTML;
            iDataLength = p_iDataLength;

            // heuristics are kept away from the very end of the data: their limit is the
            // data length less MIN_DATA_SIZE_FOR_HEURISTICS
            iMaxHeuDataLength = iDataLength - MIN_DATA_SIZE_FOR_HEURISTICS;

            // objects the tag parser works with
            oP = p_oP;
            oChunk = p_oChunk;
            sText = p_sText;
            oE = p_oE;
            oHE = p_oHE;
        }
示例#16
0
 /// <summary>
 /// Steps back to the given chunk: CurPos is set to the chunk's ContentPosition and the chunk
 /// becomes CurrentChunk. Does nothing when <paramref name="chunk"/> is null.
 /// </summary>
 /// <param name="chunk">Chunk to step back to; may be null</param>
 public void StepBack(HTMLchunk chunk)
 {
     if (chunk != null)
     {
         CurPos = chunk.ContentPosition;
         CurrentChunk = chunk;
     }
 }
示例#17
0
        /// <summary>
        /// Inspects the attributes of a tag and moves the state machine forward when an "id"
        /// attribute with one of the known values is found
        /// </summary>
        /// <param name="oChunk">Chunk whose first iParams entries of sParams/sValues are inspected</param>
        /// <param name="state">Current state; overwritten when a transition applies</param>
        private void HandleParam( HTMLchunk oChunk, ref int state )
        {
            for ( int i = 0; i < oChunk.iParams; i++ )
            {
                // every transition below is triggered by an id attribute
                if ( oChunk.sParams[i] != "id" )
                {
                    continue;
                }

                switch ( oChunk.sValues[i] )
                {
                    case "main_right_left":
                        if ( state == 0 )
                        {
                            state = 1;
                        }
                        break;

                    case "word":
                        if ( state > 0 )
                        {
                            state = 2;
                        }
                        break;

                    case "pron":
                        if ( state == 3 )
                        {
                            state = 4;
                        }
                        break;

                    case "exp_exp":
                        if ( state > 2 )
                        {
                            state = 6;
                            // this section is left by counting closing divs, starting from one
                            divCount = 1;
                        }
                        break;

                    case "exp_eg":
                        if ( state > 2 )
                        {
                            state = 8;
                        }
                        break;

                    case "exp_tran":
                        if ( state > 2 )
                        {
                            state = 11;
                        }
                        break;

                    case "exp_eee":
                        if ( state > 2 )
                        {
                            state = 14;
                            divCount = 1;
                        }
                        break;

                    case "exp_baike":
                        if ( state > 2 )
                        {
                            state = 16;
                            divCount = 1;
                        }
                        break;
                }
            }
        }
示例#18
0
        /// <summary>
        /// Parses an HTML string and returns a copy of every chunk whose raw HTML is not
        /// empty or whitespace only
        /// </summary>
        /// <param name="str">HTML source to parse</param>
        /// <returns>Copied chunks in parse order; each copy carries oHTML, oType and sTag only</returns>
        private HTMLchunk[] htmlParse(string str)
        {
            //return value
            ArrayList ret = new ArrayList();

            //init parser
            Majestic12.HTMLparser parser = new Majestic12.HTMLparser();

            //keep raw html because we need to reconstruct it
            parser.bKeepRawHTML = true;
            //keep text... this is for parsing just tags
            parser.bTextMode = true;
            //initialize to parse the string
            parser.Init(str);

            try
            {
                Majestic12.HTMLchunk chunk = null;
                // we parse until returned chunk is null indicating we reached end of parsing
                while ((chunk = parser.ParseNext()) != null)
                {
                    //discard empty blocks for performance increase
                    string html = chunk.oHTML;
                    if (html == null || html.Trim().Length == 0)
                    {
                        continue;
                    }

                    //copy the fields we need into a chunk of our own, so the result does not depend
                    //on the chunk object handed out by the parser; strings are immutable, so sharing
                    //the references is safe and the obsolete String.Copy (which throws on null) is not needed
                    HTMLchunk clone = new HTMLchunk(false);
                    clone.oHTML = html;
                    clone.oType = chunk.oType;
                    clone.sTag = chunk.sTag;

                    ret.Add(clone);
                }
            }
            finally
            {
                //reset the parser even when parsing throws
                parser.CleanUp();
            }

            //return the collected chunks as an array
            return (HTMLchunk[])ret.ToArray(typeof(HTMLchunk));
        }
示例#19
0
        /// <summary>
        ///     Resets the parser so it is ready for the next parsing run: entities are initialised
        ///     if they are missing, the HTML bytes are released, a fresh chunk is created and the
        ///     position and data length go back to zero
        /// </summary>
        public void CleanUp()
        {
            bool entitiesMissing = Entities == null;

            if (entitiesMissing)
            {
                InitEntities();
            }

            HtmlBytes = null;
            CurrentChunk = new HTMLchunk(true);

            CurPos = 0;
            DataLength = 0;
        }
示例#20
0
 /// <summary>
 /// Advances the state machine when an opening tag is met:
 /// div in state 0, ul in state 2, li in state 4 and a in state 5 each move on to the next state
 /// </summary>
 /// <param name="oChunk">Chunk describing the opening tag</param>
 /// <param name="state">Current state; overwritten when a transition applies</param>
 private void HandleOpenTag(HTMLchunk oChunk, ref int state)
 {
     switch (oChunk.sTag)
     {
         case "div":
             if (state == 0)
             {
                 state = 1;
             }
             break;

         case "ul":
             if (state == 2)
             {
                 state = 3;
             }
             break;

         case "li":
             if (state == 4)
             {
                 state = 5;
             }
             break;

         case "a":
             if (state == 5)
             {
                 state = 6;
             }
             break;
     }
 }
示例#21
0
        /// <summary>
        /// Builds the HTML-encoded source of one document version with the edits of a diff marked up:
        /// the first chunk of every relevant edit is prefixed with an opening marker and the last
        /// chunk of the edit is followed by the closing marker
        /// </summary>
        /// <param name="chunks">Chunks of the version to render</param>
        /// <param name="edits">Edit script describing the differences between the two versions</param>
        /// <param name="isOlderVersion">
        /// true when <paramref name="chunks"/> is the older version (positions come from StartA and
        /// deletes are marked), false for the newer version (positions come from StartB and inserts are marked)
        /// </param>
        /// <returns>All encoded chunks, including the markers, joined into one string</returns>
        private static string getMarkedUpSource(HTMLchunk[] chunks, Menees.DiffUtils.EditScript edits, bool isOlderVersion)
        {
            string[] str = new string[chunks.Length];
            //html encode the source so it wont render
            for (int i = 0; i < str.Length; i++)
            {
                str[i] = System.Web.HttpUtility.HtmlEncode(chunks[i].oHTML);
            }

            foreach (Menees.DiffUtils.Edit curr in edits)
            {
                int start = (isOlderVersion ? curr.StartA : curr.StartB);

                // An edit covers the chunks start .. start + Length - 1, so the closing marker belongs
                // after the LAST covered chunk. Using start + Length would also wrap the chunk that
                // follows the edit and would run past the array when the edit reaches the end.
                int last = start + curr.Length - 1;
                if (last < start)
                {
                    last = start;
                }

                switch (curr.Type)
                {
                    case Menees.DiffUtils.EditType.Change:
                        //changes are marked as deletes in older version and adds in newer version
                        str[start] = (isOlderVersion ? Tags.changeDelete : Tags.changeAdd) + str[start];
                        str[last] += Tags.close;
                        break;

                    case Menees.DiffUtils.EditType.Delete:
                        //deletes are marked in the older version
                        if (isOlderVersion)
                        {
                            str[start] = Tags.delete + str[start];
                            str[last] += Tags.close;
                        }
                        break;

                    case Menees.DiffUtils.EditType.Insert:
                        //Inserts are marked in the newer version
                        if (!isOlderVersion)
                        {
                            str[start] = Tags.add + str[start];
                            str[last] += Tags.close;
                        }
                        break;
                }
            }

            return String.Join("", str);
        }
示例#22
0
 /// <summary>
 /// Advances the state machine when an opening tag is met: "tr" always sets state 1,
 /// "td" sets state 2 once a row was seen, and "a" advances states 3, 5 and 7 by one
 /// or moves state 9 back to 8
 /// </summary>
 /// <param name="oChunk">Chunk describing the opening tag</param>
 /// <param name="state">Current state; overwritten when a transition applies</param>
 private void HandleOpenTag(HTMLchunk oChunk, ref int state)
 {
     switch (oChunk.sTag)
     {
         case "tr":
             state = 1;
             break;

         case "td":
             if (state > 0)
             {
                 state = 2;
             }
             break;

         case "a":
             switch (state)
             {
                 case 3:
                 case 5:
                 case 7:
                     state++;
                     break;

                 case 9:
                     state = 8;
                     break;
             }
             break;
     }
 }
示例#23
0
        /// <summary>
        /// Inspects the attributes of a tag and updates the state machine and the current
        /// result item accordingly
        /// </summary>
        /// <param name="oChunk">Chunk whose first iParams entries of sParams/sValues are inspected</param>
        /// <param name="state">Current state; overwritten when a transition applies</param>
        private void HandleParam(HTMLchunk oChunk, ref int state)
        {
            for (int i = 0; i < oChunk.iParams; i++)
            {
                string sValue = oChunk.sValues[i];
                string sName  = oChunk.sParams[i];

                if (sValue == "f" && sName == "class" && state == 2)
                {
                    state = 3;

                    // a new result block starts: store the previous item if it got a Url
                    if (!string.IsNullOrEmpty(item.Url))
                    {
                        searchResult.Results.Add(item);
                        item = new SearchEngineResult.ResultItem();
                    }
                }
                else if (sName == "href")
                {
                    if (state == 4)
                    {
                        item.Url = sValue;
                    }
                    else if ((state == 6 || state == 8) && sValue != null)
                    {
                        // URLs are machine-readable, so they are matched ordinally rather than
                        // with the rules of the current culture
                        if (sValue.IndexOf("cache", System.StringComparison.Ordinal) >= 0)
                        {
                            item.CacheUrl = sValue;
                        }
                        else if (sValue.StartsWith("s?cl=2", System.StringComparison.Ordinal))
                        {
                            item.SimilarUrl = sValue;
                        }
                    }
                }
            }
        }
示例#24
0
        // Matches an input that, taken as a whole, starts with "<!-" and ends with "->",
        // i.e. a comment-like tag written with single dashes.
        static Regex _shortHtmlComment  = new Regex(@"^<!-.*->$");   // matches "<!-Extra_Images->"

        /// <summary>
        /// Converts a tag chunk produced by Majestic-12 into an XElement that carries the
        /// chunk's attributes
        /// </summary>
        /// <param name="m12chunk">Tag chunk to convert</param>
        /// <param name="originalHtml">Raw HTML of the chunk; used to recognise tags without a name and handed to CleanupTagName</param>
        /// <param name="xmlnsIndex">Counter handed through to TryCleanupAttributeName</param>
        /// <returns>
        /// The new element; a placeholder element for the recognised kinds of nameless tag;
        /// or null when a nameless tag is not recognised and is therefore discarded
        /// </returns>
        /// <exception cref="Exception">An attribute occurs more than once with different values</exception>
        static XElement ParseTagNode(Majestic12.HTMLchunk m12chunk, string originalHtml, ref int xmlnsIndex)
        {
            if (string.IsNullOrEmpty(m12chunk.sTag))
            {
                // case-insensitive ordinal comparison: independent of the current culture and
                // safe when the first slot holds null
                if (m12chunk.sParams.Length > 0 &&
                    string.Equals(m12chunk.sParams[0], "doctype", StringComparison.OrdinalIgnoreCase))
                {
                    return(new XElement("doctype"));
                }

                if (_weirdTag.IsMatch(originalHtml))
                {
                    return(new XElement("REMOVED_weirdBlockParenthesisTag"));
                }

                if (_aspnetPrecompiled.IsMatch(originalHtml))
                {
                    return(new XElement("REMOVED_ASPNET_PrecompiledDirective"));
                }

                if (_shortHtmlComment.IsMatch(originalHtml))
                {
                    return(new XElement("REMOVED_ShortHtmlComment"));
                }

                // Nodes like "<br <br>" will end up with a m12chunk.sTag==""...  We discard these nodes.
                return(null);
            }

            string tagName = CleanupTagName(m12chunk.sTag, originalHtml);

            XElement result = new XElement(tagName);

            List <XAttribute> attributes = new List <XAttribute>();

            for (int i = 0; i < m12chunk.iParams; i++)
            {
                if (m12chunk.sParams[i] == "<!--")
                {
                    // An HTML comment was embedded within a tag. Majestic-12 reports the comment and
                    // its contents as attributes, so skip forward to the attribute that closes the
                    // comment; the increment of the outer loop then steps past that one as well.
                    // (The test has to look at the attribute names: the tag name never changes
                    // inside this loop, so testing it would skip every remaining attribute.)
                    while (i < m12chunk.iParams &&
                           m12chunk.sParams[i] != "--" &&
                           m12chunk.sParams[i] != "-->")
                    {
                        i++;
                    }

                    continue;
                }

                if (m12chunk.sParams[i] == "?" && string.IsNullOrEmpty(m12chunk.sValues[i]))
                {
                    continue;
                }

                string attributeName = m12chunk.sParams[i];

                if (!TryCleanupAttributeName(attributeName, ref xmlnsIndex, out attributeName))
                {
                    continue;
                }

                attributes.Add(new XAttribute(attributeName, m12chunk.sValues[i]));
            }

            // If attributes are duplicated with different values, we complain.
            // If attributes are duplicated with the same value, we remove all but 1.
            // The groups are materialised first because the list they are computed from is
            // modified inside the loop.
            var duplicatedAttributes = attributes.GroupBy(A => A.Name)
                                       .Where(G => G.Count() > 1)
                                       .ToList();

            foreach (var duplicatedAttribute in duplicatedAttributes)
            {
                if (duplicatedAttribute.GroupBy(DA => DA.Value).Count() > 1)
                {
                    throw new Exception("Attribute value was given different values");
                }

                attributes.RemoveAll(A => A.Name == duplicatedAttribute.Key);
                attributes.Add(duplicatedAttribute.First());
            }

            result.Add(attributes);

            return(result);
        }
示例#25
0
        /// <summary>
        /// Inspects the attributes of a tag: "id" and "class" attributes with known values move the
        /// state machine forward, "href" and "title" attributes seen in state 6 fill the current result item
        /// </summary>
        /// <param name="oChunk">Chunk whose first iParams entries of sParams/sValues are inspected</param>
        /// <param name="state">Current state; overwritten when a transition applies</param>
        private void HandleParam(HTMLchunk oChunk, ref int state)
        {
            for (int i = 0; i < oChunk.iParams; i++)
            {
                string sValue = oChunk.sValues[i];

                switch (oChunk.sParams[i])
                {
                    case "id":
                        if (sValue == "bodyContent" && state == 1)
                        {
                            state = 2;
                        }
                        break;

                    case "class":
                        if (sValue == "mw-search-results" && state == 3)
                        {
                            state = 4;
                        }
                        break;

                    case "href":
                        if (state == 6)
                        {
                            // the value is appended to the site address as is
                            item.Url = "http://en.wikipedia.org" + sValue;
                        }
                        break;

                    case "title":
                        if (state == 6)
                        {
                            item.Title = sValue;
                        }
                        break;
                }
            }
        }
示例#26
0
        /// <summary>
        /// Walks through the HTML chunk by chunk and writes a description of each chunk to the console.
        /// When bReadLineDelay is set it waits for ENTER after every chunk, otherwise it just ends the line.
        /// </summary>
        /// <param name="oP">Parser object</param>
        void ParseAndPrint(HTMLparser oP)
        {
            if (bReadLineDelay)
            {
                Console.WriteLine("Parsing HTML, will print each parsed chunk, press ENTER after each to continue");
            }

            // The parser returns tokens called HTMLchunk and re-uses the same object, so the chunk
            // must NOT be destroyed before parsing has finished. A null chunk marks the end of parsing.
            for (HTMLchunk oCurrent = oP.ParseNext(); oCurrent != null; oCurrent = oP.ParseNext())
            {
                if (oCurrent.oType == HTMLchunkType.OpenTag)
                {
                    // open tag, ie <a href="">
                    Console.Write("Open tag: " + oCurrent.sTag);

                    if (oCurrent.bHashMode)
                    {
                        // in hash mode the params and their values live in the Hashtable oParams;
                        // this makes parsing slower, so switch hash mode off for highest performance
                        foreach (string sParam in oCurrent.oParams.Keys)
                        {
                            string sValue = oCurrent.oParams[sParam].ToString();

                            if (sValue.Length == 0)
                            {
                                Console.Write(" {0}", sParam);
                            }
                            else
                            {
                                Console.Write(" {0}='{1}'", sParam, sValue);
                            }
                        }
                    }
                    else
                    {
                        // the alternative way of getting params: it may look less convenient, but it
                        // saves a LOT of CPU ticks while parsing, which makes sense when params are
                        // only needed for a few tags
                        for (int i = 0; i < oCurrent.iParams; i++)
                        {
                            if (oCurrent.sValues[i].Length == 0)
                            {
                                Console.Write(" {0}", oCurrent.sParams[i]);
                            }
                            else
                            {
                                Console.Write(" {0}='{1}'", oCurrent.sParams[i], oCurrent.sValues[i]);
                            }
                        }
                    }
                }
                else if (oCurrent.oType == HTMLchunkType.CloseTag)
                {
                    // close tag, ie </a>
                    Console.Write("Closed tag: " + oCurrent.sTag);
                }
                else if (oCurrent.oType == HTMLchunkType.Text)
                {
                    // normal text
                    Console.Write("Text: '{0}'", oCurrent.oHTML);
                }
                else if (oCurrent.oType == HTMLchunkType.Comment)
                {
                    // HTML comment, that's stuff between <!-- and -->
                    // Comments are not finalised by default for performance reasons, so the chunk is
                    // finalised here, unless the parser keeps raw HTML, in which case that is not needed
                    if (!oP.bKeepRawHTML)
                    {
                        oCurrent.Finalise();
                    }

                    Console.Write("Comment: " + oCurrent.oHTML);
                }

                if (bReadLineDelay)
                {
                    Console.ReadLine();
                }
                else
                {
                    Console.WriteLine("");
                }
            }
        }
示例#27
0
        /// <summary>
        /// Internal: parses tag that started from current position
        /// </summary>
        /// <returns>HTMLchunk with tag information</returns>
        internal HTMLchunk ParseTag(ref int iCurPos)
        {
            /*
             *  WARNING: this code was optimised for performance rather than for readability,
             *  so be extremely careful at changing it -- your changes could easily result in wrongly parsed HTML
             *
             *  This routine takes about 60% of CPU time, in theory its the best place to gain extra speed,
             *  but I've spent plenty of time doing it, so it won't be easy... and if it is easy then please post
             *  your changes for everyone to enjoy!
             *
             *
             * */

            //bool bWhiteSpaceHere=false;

            //bool bParamValue=false;
            byte cChar=0;
            byte cPeek=0;

            // if true it means we have parsed complete tag
            //bool bGotTag=false;

            //int iEqualIdx=0;

            // we reach this function immediately after tag's byte (<) was
            // detected, so we need to save it in order to keep correct HTML copy
            // oChunk.Append((byte)'<'); // (byte)'<'

            /*
            oChunk.bBuffer[0]=60;
            oChunk.iBufPos=1;
            oChunk.iHTMLen=1;
            */

            // initialise peeked char - this will point to the next after < character
            if(iCurPos<iDataLength)
            {
                cPeek=bHTML[iCurPos];

                // in case of comments ! must follow immediately after <
                if(cPeek==(byte)'!')
                {
                    if(iCurPos+2<iDataLength &&
                        bHTML[iCurPos+1]==(byte)'-' && bHTML[iCurPos+2]==(byte)'-')
                    {
                        // we detected start of comments here, instead of parsing the rest here we will
                        // call special function tuned to do the job much more effectively
                        oChunk.sTag="!--";
                        oChunk.oType=HTMLchunkType.Comment;
                        oChunk.bComments=true;
                        // oChunk.Append((byte)'!');
                        // oChunk.Append((byte)'-');
                        // oChunk.Append((byte)'-');
                        iCurPos+=3;
                        bool bFullTag;
                        oChunk=ParseComments(ref iCurPos,out bFullTag);

                        oChunk.iChunkLength=iCurPos-oChunk.iChunkOffset;

                        if(oP.bAutoKeepComments || oP.bKeepRawHTML)
                        {
                            if(!oP.bAutoExtractBetweenTagsOnly)
                                oChunk.oHTML=GetString(oChunk.iChunkOffset,oChunk.iChunkLength);
                            else
                            {
                                oChunk.oHTML=GetString(oChunk.iChunkOffset+4,oChunk.iChunkLength-(bFullTag ? 7 : 4));
                            }

                        }

                        return oChunk;
                    }

                    // ok we might have here CDATA element of XML:
                    // ref: http://www.w3schools.com/xml/xml_cdata.asp
                    if(iCurPos+7<iDataLength &&
                        bHTML[iCurPos+1]==(byte)'[' &&
                        bHTML[iCurPos+2]==(byte)'C' &&
                        bHTML[iCurPos+3]==(byte)'D' &&
                        bHTML[iCurPos+4]==(byte)'A' &&
                        bHTML[iCurPos+5]==(byte)'T' &&
                        bHTML[iCurPos+6]==(byte)'A' &&
                        bHTML[iCurPos+7]==(byte)'['
                        )
                    {
                        // we detected start of a CDATA section here, instead of parsing the rest here we will
                        // call special function tuned to do the job much more effectively
                        oChunk.sTag="![CDATA[";
                        oChunk.oType=HTMLchunkType.Comment;
                        oChunk.bComments=true;
                        // oChunk.Append((byte)'!');
                        // oChunk.Append((byte)'-');
                        // oChunk.Append((byte)'-');
                        iCurPos+=8;
                        bool bFullTag;
                        oChunk=ParseCDATA(ref iCurPos,out bFullTag);

                        oChunk.iChunkLength=iCurPos-oChunk.iChunkOffset;

                        if(oP.bAutoKeepComments || oP.bKeepRawHTML)
                        {
                            if(!oP.bAutoExtractBetweenTagsOnly)
                                oChunk.oHTML=GetString(oChunk.iChunkOffset,oChunk.iChunkLength);
                            else
                            {
                                oChunk.oHTML=GetString(oChunk.iChunkOffset+4+5,
                                    oChunk.iChunkLength-(bFullTag ? 7+5 : 4+5));
                            }

                        }

                        return oChunk;
                    }

                }

            }
            else
            {
                // empty tag but its not closed, so we will call it open...
                oChunk.oType=HTMLchunkType.OpenTag;
                // end of data... before it started
                return oChunk;
            }

            // tag ID, non-zero if matched by heuristics engine
            int iTagID=0;

            // STAGE 0: lets try some heuristics to see if we can quickly identify most common tags
            // that should be present most of the time, this should save a lot of looping and string creation
            if(bEnableHeuristics && iCurPos<iMaxHeuDataLength)
            {
                // check if we have got closure of the tag
                if(cPeek==(byte)'/')
                {
                    oChunk.bClosure=true;
                    oChunk.bEndClosure=false;
                    oChunk.oType=HTMLchunkType.CloseTag;
                    iCurPos++;
                    cPeek=bHTML[iCurPos];
                }

                cChar=bHTML[iCurPos+1];

                // probability of having a match is very high (or so we expect)
                iTagID=oHE.MatchTag(cPeek,cChar);

                if(iTagID!=0)
                {
                    if(iTagID<0)
                    {
                        iTagID*=-1;
                        // single character tag
                        oChunk.sTag=oHE.GetString(iTagID);

                        // see if we got fully closed tag
                        if(cChar==(byte)'>')
                        {
                            iCurPos+=2;
                            goto ReturnChunk;
                        }

                        cPeek=cChar;
                        iCurPos++;

                        // everything else means we need to continue scanning as we may have params and stuff
                        goto AttributeParsing;
                    }
                    else
                    {
                        // ok, we have here 2 or more character string that we need to check further
                        // often when we have full 2 char match the next char will be >, if that's the case
                        // then we definitely matched our tag
                        byte cNextChar=bHTML[iCurPos+2];

                        if(cNextChar==(byte)'>')
                        {
                            //oChunk.sTag=oHE.GetString(iTagID);
                            oChunk.sTag=oHE.GetTwoCharString(cPeek,cChar);
                            iCurPos+=3;

                            goto ReturnChunk;
                        }

                        // ok, check next char for space, if that's the case we still got our tag
                        // but need to skip to attribute parsing
                        if(cNextChar==(byte)' ')
                        {
                            //oChunk.sTag=oHE.GetString(iTagID);
                            oChunk.sTag=oHE.GetTwoCharString(cPeek,cChar);
                            iCurPos+=2;

                            cPeek=cNextChar;

                            goto AttributeParsing;
                        }

                        // ok, we are not very lucky, but it is still worth fighting for
                        // now we need to check fully long string against what we have matched, maybe
                        // we got exact match and we can avoid full parsing of the tag
                        byte[] bTag=oHE.GetStringData(iTagID);

                        if(iCurPos+bTag.Length+5>=iDataLength)
                            goto TagParsing;

                        // in a loop (and this is not an ideal solution, but still)
                        for(int i=2; i<bTag.Length; i++)
                        {
                            // if a single char is not matched, then we fall back to full tag parsing
                            if(bTag[i]!=bHTML[iCurPos+i])
                            {
                                goto TagParsing;
                            }
                        }

                        // ok we matched full long word, but we need to be sure that char
                        // after the word is ' ' or '>' as otherwise we may have matched prefix of even longer
                        // word
                        cNextChar=bHTML[iCurPos+bTag.Length];

                        if(cNextChar==(byte)'>')
                        {
                            oChunk.sTag=oHE.GetString(iTagID);
                            iCurPos+=bTag.Length+1;

                            goto ReturnChunk;
                        }

                        if(cNextChar==(byte)' ')
                        {
                            cPeek=cNextChar;
                            oChunk.sTag=oHE.GetString(iTagID);
                            iCurPos+=bTag.Length;

                            goto AttributeParsing;
                        }

                        // no luck: we need to parse tag fully as our heuristical matching failed miserably :'o(
                    }

                }
            }

            TagParsing:

            sText.Clear();

            byte bCharType=0;

            // STAGE 1: parse tag (anything until > or /> or whitespace leading to start of attribute)
            while(cPeek!=0)
            {
                bCharType=bTagCharTypes[cPeek];

                //if(cPeek<=32 && bWhiteSpace[cPeek]==1)
                if(bCharType==(byte)TagCharType.WhiteSpace)
                {
                    iCurPos++;

                    // speculative loop unroll -- we have a very good chance of seeing non-space char next
                    // so instead of setting up loop we will just read it directly, this should save ticks
                    // on having to prepare while() loop
                    if(iCurPos<iDataLength)
                        cChar=bHTML[iCurPos++];
                    else
                        cChar=0;

                    bCharType=bTagCharTypes[cChar];

                    //if(cChar==' ' || cChar=='\t' || cChar==13 || cChar==10)
                    //if(cChar<=32 && bWhiteSpace[cChar]==1)
                    if(bCharType==(byte)TagCharType.WhiteSpace)
                    {

                        while(iCurPos<iDataLength)
                        {
                            cChar=bHTML[iCurPos++];

                            bCharType=bTagCharTypes[cChar];
                            if(bCharType==(byte)TagCharType.WhiteSpace)
                            //if(cChar!=' ' && cChar!='\t' && cChar!=13 && cChar!=10)
                            {
                                //cPeek=bHTML[iCurPos];
                                continue;
                            }

                            break;
                        }

                        if(iCurPos>=iDataLength)
                            cChar=0;
                    }

                    //bWhiteSpaceHere=true;

                    // now, if we have already got tag it means that we are most likely
                    // going to need to parse tag attributes
                    if(sText.iBufPos>0)
                    {
                        oChunk.sTag=sText.SetToStringASCII();

                        // oChunk.Append((byte)' ');

                        iCurPos--;

                        if(iCurPos<iDataLength)
                            cPeek=bHTML[iCurPos];
                        else
                            cPeek=0;

                        break;
                    }

                }
                else
                {
                    // reuse Peeked char from previous run
                    //cChar=cPeek; iCurPos++;
                    if(iCurPos<iDataLength)
                        cChar=bHTML[iCurPos++];
                    else
                        cChar=0;
                }

                if(iCurPos<iDataLength)
                    cPeek=bHTML[iCurPos];
                else
                    cPeek=0;

                // most likely we should have lower-cased ASCII char
                if(bCharType==(byte)TagCharType.LowerCasedASCIIorDigit)
                {
                    sText.bBuffer[sText.iBufPos++]=cChar;
                    // oChunk.Append(cChar);
                    continue;
                }

                // tag end - we did not have any params
                if(cChar==(byte)'>')
                {
                    if(sText.iBufPos>0)
                        oChunk.sTag=sText.SetToStringASCII();

                    if(!oChunk.bClosure)
                        oChunk.oType=HTMLchunkType.OpenTag;

                    return oChunk;
                }

                // closure of tag sign
                if(cChar==(byte)'/')
                {
                    oChunk.bClosure=true;
                    oChunk.bEndClosure=(sText.iBufPos>0);
                    oChunk.oType=HTMLchunkType.CloseTag;
                    continue;
                }

                // 03/08/08 XML support: ?xml tags - grrr
                if(cChar==(byte)'?')
                {
                    sText.bBuffer[sText.iBufPos++]=cChar;
                    continue;
                }

                // nope, we have got upper cased ASCII char	- this seems to be LESS likely than > and /
                //if(cChar>=65 && cChar<=90)
                if(bCharType>32)
                {
                    // bCharType in this case contains already lower-cased char
                    sText.bBuffer[sText.iBufPos++]=bCharType;
                    // oChunk.Append(bCharType);
                    continue;
                }

                // we might have namespace : sign here - all text before would have to be
                // saved as namespace and we will need to continue parsing actual tag
                if(bCharType==(byte)TagCharType.NameSpaceColon)
                {
                    // ok here we got a choice - we can just continue and treat the whole
                    // thing as a single tag with namespace stuff prefixed, OR
                    // we can separate first part into namespace and keep tag as normal
                    sText.bBuffer[sText.iBufPos++]=(byte)':';
                    continue;
                }

                // ok, we have got some other char - we break out to deal with it in attributes part
                break;

            }

            if(cPeek==0)
            {
                return oChunk;
            }

            // if true then equal sign was found
            //bool bEqualsSign=false;

            // STAGE 2: parse attributes (if any available)
            // attribute name can be standalone or with value after =
            // attribute itself can't have entities or anything like this - we expect it to be in ASCII characters

            AttributeParsing:

            string sAttrName;

            if(iTagID!=0)
            {

                // first, skip whitespace:
                if(cPeek<=32 && bTagCharTypes[cPeek]==(byte)TagCharType.WhiteSpace)
                {
                    // most likely next char is not-whitespace
                    iCurPos++;

                    if(iCurPos>=iDataLength)
                        goto ReturnChunk;

                    cPeek=bHTML[iCurPos];

                    if(cPeek<=32 && bTagCharTypes[cPeek]==(byte)TagCharType.WhiteSpace)
                    {
                        // ok long loop here then
                        while(iCurPos<iDataLength)
                        {
                            cPeek=bHTML[iCurPos++];

                            if(cPeek<=32 && bTagCharTypes[cPeek]==(byte)TagCharType.WhiteSpace)
                                continue;

                            break;
                        }

                        if(cPeek==(byte)'>')
                            goto ReturnChunk;

                        iCurPos--;

                        if(iCurPos>=iDataLength)
                            goto ReturnChunk;
                    }

                    if(iCurPos>=iDataLength)
                        goto ReturnChunk;

                }

                // ok we have got matched tag, it is possible that we might be able to quickly match
                // attribute name known to be used for that tag:
                int iAttrID=oHE.MatchAttr(cPeek,iTagID);

                if(iAttrID>0)
                {
                    byte[] bAttr=oHE.GetAttrData(iAttrID);

                    if(iCurPos+bAttr.Length+2>=iDataLength)
                        goto ActualAttributeParsing;

                    // in a loop (and this is not an ideal solution, but still)
                    for(int i=1; i<bAttr.Length; i++)
                    {
                        // if a single char is not matched, then we fall back to regular attribute parsing
                        if(bAttr[i]!=bHTML[iCurPos+i])
                        {
                            goto ActualAttributeParsing;
                        }
                    }

                    byte cNextChar=bHTML[iCurPos+bAttr.Length];

                    // ok, we expect next symbol to be =
                    if(cNextChar==(byte)'=')
                    {
                        sAttrName=oHE.GetAttr(iAttrID);
                        iCurPos+=bAttr.Length+1;
                        cPeek=bHTML[iCurPos];

                        goto AttributeValueParsing;
                    }

                }

            }

            ActualAttributeParsing:

            sText.Clear();

            // doing exactly the same thing as in tag parsing
            while(cPeek!=0)
            {
                bCharType=bTagCharTypes[cPeek];

                //if(cPeek<=32 && bWhiteSpace[cPeek]==1)
                if(bCharType==(byte)TagCharType.WhiteSpace)
                {
                    iCurPos++;

                    // speculative loop unroll -- we have a very good chance of seeing non-space char next
                    // so instead of setting up loop we will just read it directly, this should save ticks
                    // on having to prepare while() loop
                    if(iCurPos<iDataLength)
                        cChar=bHTML[iCurPos++];
                    else
                    {
                        cPeek=0;
                        break;
                    }

                    bCharType=bTagCharTypes[cChar];

                    //if(cChar==' ' || cChar=='\t' || cChar==13 || cChar==10)
                    //if(cChar<=32 && bWhiteSpace[cChar]==1)
                    if(bCharType==(byte)TagCharType.WhiteSpace)
                    {

                        while(iCurPos<iDataLength)
                        {
                            cChar=bHTML[iCurPos++];

                            bCharType=bTagCharTypes[cChar];
                            if(bCharType==(byte)TagCharType.WhiteSpace)
                            //if(cChar!=' ' && cChar!='\t' && cChar!=13 && cChar!=10)
                            {
                                //cPeek=bHTML[iCurPos];
                                continue;
                            }

                            //if(cChar==(byte)'>')
                            //	goto ReturnChunk;

                            //iCurPos--;
                            break;
                        }

                        if(iCurPos>=iDataLength)
                        {
                            cChar=0;
                            cPeek=0;
                            break;
                        }
                    }

                    //bWhiteSpaceHere=true;

                    // now, if we have already got attribute name it means that we need to go to parse value (which may not be present)
                    if(sText.iBufPos>0)
                    {
                        // oChunk.Append((byte)' ');

                        iCurPos--;

                        if(iCurPos<iDataLength)
                            cPeek=bHTML[iCurPos];
                        else
                            cPeek=0;

                        // ok, we have got attribute name and now we have got next char there

                        // most likely we have got = here  and then value
                        if(cPeek==(byte)'=')
                        {
                            //bEqualsSign=true;

                            // move forward one char
                            iCurPos++;

                            if(iCurPos<iDataLength)
                                cPeek=bHTML[iCurPos];
                            else
                                cPeek=0;

                            break;
                        }

                        // or we can have end of tag itself, doh!
                        if(cPeek==(byte)'>')
                        {
                            // move forward one char
                            iCurPos++;

                            if(sText.iBufPos>0)
                                oChunk.AddParam(sText.SetToStringASCII(),"",(byte)' ');

                            if(!oChunk.bClosure)
                                oChunk.oType=HTMLchunkType.OpenTag;

                            return oChunk;
                        }

                        // closure
                        if(cPeek==(byte)'/')
                        {
                            oChunk.bClosure=true;
                            oChunk.bEndClosure=true;
                            oChunk.oType=HTMLchunkType.CloseTag;
                            continue;
                        }

                        // ok, we have got new char starting after current attribute name is fully parsed
                        // this means the attribute name is on its own and the char we found is start
                        // of a new attribute
                        oChunk.AddParam(sText.SetToStringASCII(),"",(byte)' ');
                        sText.Clear();
                        goto AttributeParsing;
                    }

                }
                else
                {
                    // reuse Peeked char from previous run
                    //cChar=cPeek; iCurPos++;
                    if(iCurPos<iDataLength)
                        cChar=bHTML[iCurPos++];
                    else
                        cChar=0;
                }

                if(iCurPos<iDataLength)
                    cPeek=bHTML[iCurPos];
                else
                    cPeek=0;

                // most likely we should have lower-cased ASCII char here
                if(bCharType==(byte)TagCharType.LowerCasedASCIIorDigit)
                {
                    sText.bBuffer[sText.iBufPos++]=cChar;
                    // oChunk.Append(cChar);
                    continue;
                }

                // = with attribute value to follow
                if(cChar==(byte)'=')
                {
                    //bEqualsSign=true;
                    break;
                }

                // nope, we have got upper cased ASCII char	- this seems to be LESS likely than > and /
                //if(cChar>=65 && cChar<=90)
                if(bCharType>32)
                {
                    // bCharType in this case contains already lower-cased char
                    sText.bBuffer[sText.iBufPos++]=bCharType;
                    // oChunk.Append(bCharType);
                    continue;
                }

                // tag end - we did not have any params
                if(cChar==(byte)'>')
                {
                    if(sText.iBufPos>0)
                        oChunk.AddParam(sText.SetToStringASCII(),"",(byte)' ');

                    if(!oChunk.bClosure)
                        oChunk.oType=HTMLchunkType.OpenTag;

                    return oChunk;
                }

                // closure of tag sign
                if(cChar==(byte)'/')
                {
                    oChunk.bClosure=true;
                    oChunk.bEndClosure=true;
                    oChunk.oType=HTMLchunkType.CloseTag;
                    continue;
                }

                // some other char
                sText.bBuffer[sText.iBufPos++]=cChar;
                // oChunk.Append(cChar);
            }

            if(cPeek==0)
            {
                if(sText.iBufPos>0)
                    oChunk.AddParam(sText.SetToStringASCII(),"",(byte)' ');

                if(!oChunk.bClosure)
                    oChunk.oType=HTMLchunkType.OpenTag;

                return oChunk;
            }

            sAttrName=sText.SetToStringASCII();

            AttributeValueParsing:

            /// ***********************************************************************
            /// STAGE 3: parse attribute value
            /// ***********************************************************************

            // the value could be just string, or in quotes (single or double)
            // or we can have next attribute name start, in which case we will jump back to attribute parsing

            // for tracking quotes purposes
            byte cQuotes=cPeek;

            int iValueStartOffset;

            // skip whitespace if any
            if(cPeek<=32 && bTagCharTypes[cPeek]==(byte)TagCharType.WhiteSpace)
            {
                iCurPos++;

                // speculative loop unroll -- we have a very good chance of seeing non-space char next
                // so instead of setting up loop we will just read it directly, this should save ticks
                // on having to prepare while() loop
                if(iCurPos<iDataLength)
                    cPeek=bHTML[iCurPos];
                else
                {
                    iValueStartOffset=iCurPos-1;
                    goto AttributeValueEnd;
                }

                //if(cChar==' ' || cChar=='\t' || cChar==13 || cChar==10)
                //if(cChar<=32 && bWhiteSpace[cChar]==1)
                if(cPeek<=32 && bTagCharTypes[cPeek]==(byte)TagCharType.WhiteSpace)
                {

                    while(iCurPos<iDataLength)
                    {
                        cPeek=bHTML[iCurPos++];

                        if(cPeek<=32 && bTagCharTypes[cPeek]==(byte)TagCharType.WhiteSpace)
                        //if(cChar!=' ' && cChar!='\t' && cChar!=13 && cChar!=10)
                        {
                            //cPeek=bHTML[iCurPos];
                            continue;
                        }

                        iCurPos--;
                        break;
                    }

                    if(iCurPos>=iDataLength)
                    {
                        iValueStartOffset=iCurPos-1;
                        goto AttributeValueEnd;
                    }
                }

                cQuotes=cPeek;
            }

            // because we deal with VALUE of the attribute it means we can't lower-case it,
            // or skip whitespace (if in quotes), which in practice means that we don't need to copy
            // it to temporary string buffer, we can just remember starting offset and then create string from
            // data in bHTML

            // ok, first char can be one of the quote chars or something else
            if(cPeek!='\"' && cPeek!='\'')
            {
                iValueStartOffset=iCurPos;

                cQuotes=(byte)' ';
                // any other char here means we have value up until next whitespace or end of tag
                // this gives us good opportunity to scan fairly quickly without otherwise redundant
                // checks - this should happen fairly rarely, however loop dealing with data between quotes
                // will happen often enough and its best to eliminate as much stuff from it as possible
                //sText.bBuffer[sText.iBufPos++]=cPeek;

                // move to next char
                if(iCurPos<iDataLength)
                    cPeek=bHTML[iCurPos++];
                else
                {
                    goto AttributeValueEnd;
                }

                while(cPeek!=0)
                {
                    // if whitespace then we got our value and need to go back to param
                    if(cPeek<=32 && bTagCharTypes[cPeek]==(byte)TagCharType.WhiteSpace)
                    {
                        oChunk.AddParam(sAttrName,GetString(iValueStartOffset,iCurPos-iValueStartOffset-1),(byte)' ');
                        iCurPos--;
                        goto AttributeParsing;
                    }

                    // end of tag?
                    if(cPeek==(byte)'>')
                    {
                        //iCurPos--;
                        break;
                    }

                    if(iCurPos<iDataLength)
                        cPeek=bHTML[iCurPos++];
                    else
                    {
                        iCurPos=iDataLength+1;
                        goto AttributeValueEnd;
                    }
                }

                // ok we are done, add outstanding attribute
                oChunk.AddParam(sAttrName,GetString(iValueStartOffset,iCurPos-iValueStartOffset-1),(byte)' ');

                goto ReturnChunk;
            }

            // move one step forward
            iCurPos++;

            iValueStartOffset=iCurPos;

            if(iCurPos<iDataLength)
                cPeek=bHTML[iCurPos++];
            else
            {

                goto AttributeValueEnd;
            }

            // attribute value parsing from between two quotes
            while(cPeek!=0)
            {
                // check whether we have got possible entity (can be anything starting with &)
                if(cPeek==38)
                {
                    int iPrevPos=iCurPos;

                    char cEntityChar=oE.CheckForEntity(bHTML,ref iCurPos,iDataLength);

                    // restore current symbol
                    if(cEntityChar==0)
                    {
                        if(iCurPos<iDataLength)
                            cPeek=bHTML[iCurPos++];
                        else
                            break;

                        //sText.bBuffer[sText.iBufPos++]=38; //(byte)'&';;
                        continue;
                    }
                    else
                    {
                        // okay we have got an entity, our hope of not having to copy stuff into variable
                        // is over, we have to continue in a slower fashion :(
                        // but thankfully this should happen very rarely, so, annoying to code, but
                        // most codepaths will run very fast!
                        int iPreEntLen=iPrevPos-iValueStartOffset-1;

                        // 14/05/08 need to clear text - it contains attribute name text
                        sText.Clear();

                        // copy previous data
                        if(iPreEntLen>0)
                        {
                            Array.Copy(bHTML,iValueStartOffset,sText.bBuffer,0,iPreEntLen);
                            sText.iBufPos=iPreEntLen;
                        }

                        // we have to skip now to next byte, since
                        // some converted chars might well be control chars like >
                        oChunk.bEntities=true;

                        if(cChar==(byte)'<')
                            oChunk.bLtEntity=true;

                        // unless is space we will ignore it
                        // note that this won't work if &nbsp; is defined as it should
                        // byte int value of 160, rather than 32.
                        //if(cChar!=' ')
                        sText.Append(cEntityChar);

                        if(iCurPos<iDataLength)
                            cPeek=bHTML[iCurPos++];
                        else
                        {

                            goto AttributeValueEnd;
                        }

                        // okay, we continue here using in effect new inside loop as we might have more entities here
                        // attribute value parsing from between two quotes
                        while(cPeek!=0)
                        {
                            // check whether we have got possible entity (can be anything starting with &)
                            if(cPeek==38)
                            {
                                char cNewEntityChar=oE.CheckForEntity(bHTML,ref iCurPos,iDataLength);

                                // restore current symbol
                                if(cNewEntityChar!=0)
                                {
                                    if(cNewEntityChar==(byte)'<')
                                        oChunk.bLtEntity=true;

                                    sText.Append(cNewEntityChar);

                                    if(iCurPos<iDataLength)
                                        cPeek=bHTML[iCurPos++];
                                    else
                                        goto AttributeValueEnd;

                                    continue;
                                }
                            }

                            // check if is end of quotes
                            if(cPeek==cQuotes)
                            {
                                // ok we finished scanning it: add param with value and then go back to param name parsing
                                oChunk.AddParam(sAttrName,sText.SetToString(),cQuotes);

                                if(iCurPos<iDataLength)
                                    cPeek=bHTML[iCurPos];
                                else
                                    break;

                                goto AttributeParsing;
                            }

                            sText.bBuffer[sText.iBufPos++]=cPeek;
                            //sText.Append(cPeek);

                            if(iCurPos<iDataLength)
                                cPeek=bHTML[iCurPos++];
                            else
                                break;
                        }

                        oChunk.AddParam(sAttrName,sText.SetToString(),cQuotes);
                        goto ReturnChunk;
                    }
                }

                // check if is end of quotes
                if(cPeek==cQuotes)
                {
                    // ok we finished scanning it: add param with value and then go back to param name parsing
                    //sText.Clear();

                    oChunk.AddParam(sAttrName,GetString(iValueStartOffset,iCurPos-iValueStartOffset-1),cQuotes);

                    if(iCurPos<iDataLength)
                        cPeek=bHTML[iCurPos];
                    else
                    {
                        //iCurPos++;
                        break;
                    }

                    goto AttributeParsing;
                }

                if(iCurPos<iDataLength)
                    cPeek=bHTML[iCurPos++];
                else
                {
                    //iCurPos++;
                    break;
                }
            }

            AttributeValueEnd:

            // ok we are done, add outstanding attribute
            int iLen=iCurPos-iValueStartOffset-1;
            if(iLen>0)
                oChunk.AddParam(sAttrName,GetString(iValueStartOffset,iLen),cQuotes);
            else
                oChunk.AddParam(sAttrName,"",cQuotes);

            ReturnChunk:

            if(oChunk.bClosure)
            {
                oChunk.oType=HTMLchunkType.CloseTag;
            }
            else
                oChunk.oType=HTMLchunkType.OpenTag;

            return oChunk;
        }
示例#28
0
 /// <summary>
 /// Decodes the bytes a chunk was parsed from and stores the resulting text in the chunk's oHTML field.
 /// </summary>
 /// <param name="oChunk">Chunk returned by ParseNext of this very parser; its offset and length are
 /// applied to this parser's own HTML buffer, so it must originate from the same HTML data</param>
 public void SetRawHTML(HTMLchunk oChunk)
 {
     // slice of the source buffer that the chunk covers
     int iStart = oChunk.iChunkOffset;
     int iCount = oChunk.iChunkLength;

     // note: keeping the raw bytes instead of a decoded string would be more faithful
     // to the original data, but a string is what the chunk exposes
     string sRaw = oEnc.GetString(bHTML, iStart, iCount);

     oChunk.oHTML = sRaw;
 }
示例#29
0
        /// <summary>
        /// Rebuilds the HTML of a chunk sequence, wrapping every text chunk touched by the edit script
        /// in the marker tags defined by <c>Tags</c>.
        /// </summary>
        /// <param name="chunks">Chunks of one document version; their oHTML is the text that gets joined</param>
        /// <param name="edits">Edit script describing the differences between the two versions</param>
        /// <param name="isOlderVersion">True when <paramref name="chunks"/> is version A of the diff
        /// (edit positions are read from StartA), false for version B (StartB)</param>
        /// <returns>All chunk HTML concatenated, with the affected text chunks wrapped in markers</returns>
        private static string getMarkedUpHtml(HTMLchunk[] chunks, Menees.DiffUtils.EditScript edits, bool isOlderVersion)
        {
            // begin with the untouched HTML of every chunk
            string[] pieces = new string[chunks.Length];
            for (int n = 0; n < pieces.Length; n++)
            {
                pieces[n] = chunks[n].oHTML;
            }

            // for now only text nodes are marked up; every other chunk type passes through unchanged
            foreach (Menees.DiffUtils.Edit edit in edits)
            {
                int first;
                if (isOlderVersion)
                {
                    first = edit.StartA;
                }
                else
                {
                    first = edit.StartB;
                }

                Menees.DiffUtils.EditType kind = edit.Type;

                if (kind == Menees.DiffUtils.EditType.Change)
                {
                    // a change shows up in both versions, each with its own opening marker
                    for (int offset = 0; offset < edit.Length; offset++)
                    {
                        int pos = first + offset;
                        if (chunks[pos].oType == HTMLchunkType.Text)
                        {
                            pieces[pos] = (isOlderVersion ? Tags.changeDelete : Tags.changeAdd) + pieces[pos] + Tags.close;
                        }
                    }
                }
                else if (kind == Menees.DiffUtils.EditType.Delete && isOlderVersion)
                {
                    // deletions are only visible in the older version
                    for (int offset = 0; offset < edit.Length; offset++)
                    {
                        int pos = first + offset;
                        if (chunks[pos].oType == HTMLchunkType.Text)
                        {
                            pieces[pos] = Tags.delete + pieces[pos] + Tags.close;
                        }
                    }
                }
                else if (kind == Menees.DiffUtils.EditType.Insert && !isOlderVersion)
                {
                    // insertions are only visible in the newer version
                    for (int offset = 0; offset < edit.Length; offset++)
                    {
                        int pos = first + offset;
                        if (chunks[pos].oType == HTMLchunkType.Text)
                        {
                            pieces[pos] = Tags.add + pieces[pos] + Tags.close;
                        }
                    }
                }
            }

            return String.Join("", pieces);
        }
示例#30
0
 /// <summary>
 /// Appends the chunk's HTML text to the field of the current item selected by the parsing state.
 /// </summary>
 /// <param name="oChunk">Chunk whose oHTML is appended</param>
 /// <param name="state">Current parsing state; only read here, never modified.
 /// NOTE(review): 5 feeds Title and 7 feeds Description - the meaning of the numeric
 /// states is defined by the caller, confirm there</param>
 private void HandleText(HTMLchunk oChunk, ref int state)
 {
     switch (state)
     {
         case 5:
             item.Title += oChunk.oHTML;
             break;

         case 7:
             item.Description += oChunk.oHTML;
             break;

         // text seen in any other state is ignored
     }
 }
示例#31
0
        /// <summary>
        /// Computes one hash code per chunk from the chunk's oHTML text.
        /// </summary>
        /// <param name="chunks">Chunks to hash</param>
        /// <returns>Array of the same length as <paramref name="chunks"/>, element i being the hash of chunk i</returns>
        private int[] hash(HTMLchunk[] chunks)
        {
            int[] codes = new int[chunks.Length];

            // one CRC32 hasher is shared by all chunks
            // NOTE(review): the meaning of the trailing (true, true, 0) arguments is defined by
            // Menees.DiffUtils.StringHasher - confirm against that library
            Menees.DiffUtils.StringHasher hasher = new Menees.DiffUtils.StringHasher(Menees.DiffUtils.HashType.CRC32, true, true, 0);

            int slot = 0;
            foreach (HTMLchunk chunk in chunks)
            {
                codes[slot] = hasher.GetHashCode(chunk.oHTML);
                slot++;
            }

            return codes;
        }
示例#32
0
        /// <summary>
        /// Handles META tags that set page encoding
        /// </summary>
        /// <param name="oP">HTML parser object that is used for parsing; its SetEncoding is called with the
        /// value of the tag's "content" attribute</param>
        /// <param name="oChunk">Parsed chunk that should contain tag META; its params are converted to
        /// hash mode if they are not in that mode already</param>
        /// <param name="bEncodingSet">Your own flag that shows whether encoding was already set or not, if set
        /// once then it should not be changed - this is the logic applied by major browsers. It is switched to
        /// true here when the parser accepts the encoding</param>
        /// <returns>True if this was META tag setting Encoding (http-equiv of "content-type" or
        /// "content-category", matched case-insensitively), false otherwise</returns>
        public static bool HandleMetaEncoding(HTMLparser oP, HTMLchunk oChunk, ref bool bEncodingSet)
        {
            // cheap length and first-char checks reject nearly all tags before the full string comparison
            if (oChunk.sTag.Length != 4 || oChunk.sTag[0] != 'm' || oChunk.sTag != "meta")
            {
                return(false);
            }

            // if we do not use hashmode already then we call conversion explicitly
            // this is slow, but METAs are very rare so performance penalty is low
            if (!oChunk.bHashMode)
            {
                oChunk.ConvertParamsToHash();
            }

            string sKey = oChunk.oParams["http-equiv"] as string;

            if (sKey == null)
            {
                return(false);
            }

            // ordinal case-insensitive comparison: no lower-cased copy of the value is allocated and
            // the result does not depend on the current thread culture
            // "content-category" is a rare case (appears to work in IE) reported to exist in some pages
            // by Martin Bächtold
            bool bContentType =
                string.Equals(sKey, "content-type", System.StringComparison.OrdinalIgnoreCase) ||
                string.Equals(sKey, "content-category", System.StringComparison.OrdinalIgnoreCase);

            if (!bContentType)
            {
                return(false);
            }

            // we might have charset here that may hint at necessity to decode page
            // once encoding is set it should not be changed, but you can be damn
            // sure there are web pages out there that do that!!!
            if (!bEncodingSet)
            {
                string sData = oChunk.oParams["content"] as string;

                // it is possible we have broken META tag without Content part
                if (sData != null)
                {
                    if (oP.SetEncoding(sData))
                    {
                        // here you need to reencode any text that you found so far
                        // most likely it will be just TITLE, the rest can be ignored anyway
                        bEncodingSet = true;
                    }
                    else
                    {
                        // failed to set encoding - most likely encoding string
                        // was incorrect or your machine lacks codepages or something
                        // else - might be good idea to put warning message here
                    }
                }
            }

            // this was an encoding META tag, whether or not the encoding was actually applied
            return(true);
        }
示例#33
0
        /// <summary>
        /// Collects the text of all chunks that were inserted or changed in version B of a diff.
        /// </summary>
        /// <param name="chunks">Chunks of version B; they are indexed through each edit's StartB</param>
        /// <param name="edits">Edit script describing the differences between the two versions</param>
        /// <returns>The oHTML of every affected text chunk, each preceded by a single space;
        /// empty string when nothing qualifies</returns>
        private static string getKewords(HTMLchunk[] chunks, Menees.DiffUtils.EditScript edits)
        {
            System.Text.StringBuilder keywords = new System.Text.StringBuilder();

            foreach (Menees.DiffUtils.Edit edit in edits)
            {
                // only insertions and changes contribute text to version B
                if (edit.Type != EditType.Insert && edit.Type != EditType.Change)
                {
                    continue;
                }

                for (int offset = 0; offset < edit.Length; offset++)
                {
                    HTMLchunk candidate = chunks[edit.StartB + offset];

                    // tags and comments are skipped, only text is collected
                    if (candidate.oType == HTMLchunkType.Text)
                    {
                        keywords.Append(" " + chunks[edit.StartB + offset].oHTML);
                    }
                }
            }

            return keywords.ToString();
        }
        /// <summary>
        /// Internal: parses tag that started from current position
        /// </summary>
        /// <returns>HTMLchunk with tag information</returns>
        internal HTMLchunk ParseTag(ref int iCurPos)
        {
            /*
             *  WARNING: this code was optimised for performance rather than for readability,
             *  so be extremely careful at changing it -- your changes could easily result in wrongly parsed HTML
             *
             *  This routine takes about 60% of CPU time, in theory its the best place to gain extra speed,
             *  but I've spent plenty of time doing it, so it won't be easy... and if it is easy then please post
             *  your changes for everyone to enjoy!
             *
             *
             * */

            //bool bWhiteSpaceHere=false;

            //bool bParamValue=false;
            byte cChar = 0;
            byte cPeek = 0;

            // if true it means we have parsed complete tag
            //bool bGotTag=false;

            //int iEqualIdx=0;

            // we reach this function immediately after tag's byte (<) was
            // detected, so we need to save it in order to keep correct HTML copy
            // oChunk.Append((byte)'<'); // (byte)'<'

            /*
             * oChunk.bBuffer[0]=60;
             * oChunk.iBufPos=1;
             * oChunk.iHTMLen=1;
             */

            // initialise peeked char - this will point to the next after < character
            if (iCurPos < iDataLength)
            {
                cPeek = bHTML[iCurPos];

                // in case of comments ! must follow immediately after <
                if (cPeek == (byte)'!')
                {
                    if (iCurPos + 2 < iDataLength &&
                        bHTML[iCurPos + 1] == (byte)'-' && bHTML[iCurPos + 2] == (byte)'-')
                    {
                        // we detected start of comments here, instead of parsing the rest here we will
                        // call special function tuned to do the job much more effectively
                        oChunk.sTag      = "!--";
                        oChunk.oType     = HTMLchunkType.Comment;
                        oChunk.bComments = true;
                        // oChunk.Append((byte)'!');
                        // oChunk.Append((byte)'-');
                        // oChunk.Append((byte)'-');
                        iCurPos += 3;
                        bool bFullTag;
                        oChunk = ParseComments(ref iCurPos, out bFullTag);

                        oChunk.iChunkLength = iCurPos - oChunk.iChunkOffset;

                        if (oP.bAutoKeepComments || oP.bKeepRawHTML)
                        {
                            if (!oP.bAutoExtractBetweenTagsOnly)
                            {
                                oChunk.oHTML = GetString(oChunk.iChunkOffset, oChunk.iChunkLength);
                            }
                            else
                            {
                                oChunk.oHTML = GetString(oChunk.iChunkOffset + 4, oChunk.iChunkLength - (bFullTag ? 7 : 4));
                            }
                        }

                        return(oChunk);
                    }

                    // ok we might have here CDATA element of XML:
                    // ref: http://www.w3schools.com/xml/xml_cdata.asp
                    if (iCurPos + 7 < iDataLength &&
                        bHTML[iCurPos + 1] == (byte)'[' &&
                        bHTML[iCurPos + 2] == (byte)'C' &&
                        bHTML[iCurPos + 3] == (byte)'D' &&
                        bHTML[iCurPos + 4] == (byte)'A' &&
                        bHTML[iCurPos + 5] == (byte)'T' &&
                        bHTML[iCurPos + 6] == (byte)'A' &&
                        bHTML[iCurPos + 7] == (byte)'['
                        )
                    {
                        // we detected start of comments here, instead of parsing the rest here we will
                        // call special function tuned to do the job much more effectively
                        oChunk.sTag      = "![CDATA[";
                        oChunk.oType     = HTMLchunkType.Comment;
                        oChunk.bComments = true;
                        // oChunk.Append((byte)'!');
                        // oChunk.Append((byte)'-');
                        // oChunk.Append((byte)'-');
                        iCurPos += 8;
                        bool bFullTag;
                        oChunk = ParseCDATA(ref iCurPos, out bFullTag);

                        oChunk.iChunkLength = iCurPos - oChunk.iChunkOffset;

                        if (oP.bAutoKeepComments || oP.bKeepRawHTML)
                        {
                            if (!oP.bAutoExtractBetweenTagsOnly)
                            {
                                oChunk.oHTML = GetString(oChunk.iChunkOffset, oChunk.iChunkLength);
                            }
                            else
                            {
                                oChunk.oHTML = GetString(oChunk.iChunkOffset + 4 + 5,
                                                         oChunk.iChunkLength - (bFullTag ? 7 + 5 : 4 + 5));
                            }
                        }

                        return(oChunk);
                    }
                }
            }
            else
            {
                // empty tag but its not closed, so we will call it open...
                oChunk.oType = HTMLchunkType.OpenTag;
                // end of data... before it started
                return(oChunk);
            }

            // tag ID, non-zero if matched by heuristics engine
            int iTagID = 0;

            // STAGE 0: lets try some heuristics to see if we can quickly identify most common tags
            // that should be present most of the time, this should save a lot of looping and string creation
            if (bEnableHeuristics && iCurPos < iMaxHeuDataLength)
            {
                // check if we have got closure of the tag
                if (cPeek == (byte)'/')
                {
                    oChunk.bClosure    = true;
                    oChunk.bEndClosure = false;
                    oChunk.oType       = HTMLchunkType.CloseTag;
                    iCurPos++;
                    cPeek = bHTML[iCurPos];
                }

                cChar = bHTML[iCurPos + 1];

                // probability of having a match is very high (or so we expect)
                iTagID = oHE.MatchTag(cPeek, cChar);

                if (iTagID != 0)
                {
                    if (iTagID < 0)
                    {
                        iTagID *= -1;
                        // single character tag
                        oChunk.sTag = oHE.GetString(iTagID);

                        // see if we got fully closed tag
                        if (cChar == (byte)'>')
                        {
                            iCurPos += 2;
                            goto ReturnChunk;
                        }

                        cPeek = cChar;
                        iCurPos++;

                        // everything else means we need to continue scanning as we may have params and stuff
                        goto AttributeParsing;
                    }
                    else
                    {
                        // ok, we have here 2 or more character string that we need to check further
                        // often when we have full 2 char match the next char will be >, if that's the case
                        // then we definately matched our tag
                        byte cNextChar = bHTML[iCurPos + 2];

                        if (cNextChar == (byte)'>')
                        {
                            //oChunk.sTag=oHE.GetString(iTagID);
                            oChunk.sTag = oHE.GetTwoCharString(cPeek, cChar);
                            iCurPos    += 3;

                            goto ReturnChunk;
                        }

                        // ok, check next char for space, if that's the case we still got our tag
                        // but need to skip to attribute parsing
                        if (cNextChar == (byte)' ')
                        {
                            //oChunk.sTag=oHE.GetString(iTagID);
                            oChunk.sTag = oHE.GetTwoCharString(cPeek, cChar);
                            iCurPos    += 2;

                            cPeek = cNextChar;


                            goto AttributeParsing;
                        }

                        // ok, we are not very lucky, but it is still worth fighting for
                        // now we need to check fully long string against what we have matched, maybe
                        // we got exact match and we can avoid full parsing of the tag
                        byte[] bTag = oHE.GetStringData(iTagID);

                        if (iCurPos + bTag.Length + 5 >= iDataLength)
                        {
                            goto TagParsing;
                        }

                        // in a loop (and this is not an ideal solution, but still)
                        for (int i = 2; i < bTag.Length; i++)
                        {
                            // if a single char is not matched, then we
                            if (bTag[i] != bHTML[iCurPos + i])
                            {
                                goto TagParsing;
                            }
                        }

                        // ok we matched full long word, but we need to be sure that char
                        // after the word is ' ' or '>' as otherwise we may have matched prefix of even longer
                        // word
                        cNextChar = bHTML[iCurPos + bTag.Length];

                        if (cNextChar == (byte)'>')
                        {
                            oChunk.sTag = oHE.GetString(iTagID);
                            iCurPos    += bTag.Length + 1;

                            goto ReturnChunk;
                        }

                        if (cNextChar == (byte)' ')
                        {
                            cPeek       = cNextChar;
                            oChunk.sTag = oHE.GetString(iTagID);
                            iCurPos    += bTag.Length;

                            goto AttributeParsing;
                        }

                        // no luck: we need to parse tag fully as our heuristical matching failed miserably :'o(
                    }
                }
            }

TagParsing:

            sText.Clear();

            byte bCharType = 0;

            // STAGE 1: parse tag (anything until > or /> or whitespace leading to start of attribute)
            while (cPeek != 0)
            {
                bCharType = bTagCharTypes[cPeek];

                //if(cPeek<=32 && bWhiteSpace[cPeek]==1)
                if (bCharType == (byte)TagCharType.WhiteSpace)
                {
                    iCurPos++;

                    // speculative loop unroll -- we have a very good chance of seeing non-space char next
                    // so instead of setting up loop we will just read it directly, this should save ticks
                    // on having to prepare while() loop
                    if (iCurPos < iDataLength)
                    {
                        cChar = bHTML[iCurPos++];
                    }
                    else
                    {
                        cChar = 0;
                    }

                    bCharType = bTagCharTypes[cChar];

                    //if(cChar==' ' || cChar=='\t' || cChar==13 || cChar==10)
                    //if(cChar<=32 && bWhiteSpace[cChar]==1)
                    if (bCharType == (byte)TagCharType.WhiteSpace)
                    {
                        while (iCurPos < iDataLength)
                        {
                            cChar = bHTML[iCurPos++];

                            bCharType = bTagCharTypes[cChar];
                            if (bCharType == (byte)TagCharType.WhiteSpace)
                            //if(cChar!=' ' && cChar!='\t' && cChar!=13 && cChar!=10)
                            {
                                //cPeek=bHTML[iCurPos];
                                continue;
                            }

                            break;
                        }

                        if (iCurPos >= iDataLength)
                        {
                            cChar = 0;
                        }
                    }

                    //bWhiteSpaceHere=true;

                    // now, if we have already got tag it means that we are most likely
                    // going to need to parse tag attributes
                    if (sText.iBufPos > 0)
                    {
                        oChunk.sTag = sText.SetToStringASCII();

                        // oChunk.Append((byte)' ');

                        iCurPos--;

                        if (iCurPos < iDataLength)
                        {
                            cPeek = bHTML[iCurPos];
                        }
                        else
                        {
                            cPeek = 0;
                        }

                        break;
                    }
                }
                else
                {
                    // reuse Peeked char from previous run
                    //cChar=cPeek; iCurPos++;
                    if (iCurPos < iDataLength)
                    {
                        cChar = bHTML[iCurPos++];
                    }
                    else
                    {
                        cChar = 0;
                    }
                }

                if (iCurPos < iDataLength)
                {
                    cPeek = bHTML[iCurPos];
                }
                else
                {
                    cPeek = 0;
                }

                // most likely we should have lower-cased ASCII char
                if (bCharType == (byte)TagCharType.LowerCasedASCIIorDigit)
                {
                    sText.bBuffer[sText.iBufPos++] = cChar;
                    // oChunk.Append(cChar);
                    continue;
                }

                // tag end - we did not have any params
                if (cChar == (byte)'>')
                {
                    if (sText.iBufPos > 0)
                    {
                        oChunk.sTag = sText.SetToStringASCII();
                    }

                    if (!oChunk.bClosure)
                    {
                        oChunk.oType = HTMLchunkType.OpenTag;
                    }

                    return(oChunk);
                }

                // closure of tag sign
                if (cChar == (byte)'/')
                {
                    oChunk.bClosure    = true;
                    oChunk.bEndClosure = (sText.iBufPos > 0);
                    oChunk.oType       = HTMLchunkType.CloseTag;
                    continue;
                }

                // 03/08/08 XML support: ?xml tags - grrr
                if (cChar == (byte)'?')
                {
                    sText.bBuffer[sText.iBufPos++] = cChar;
                    continue;
                }

                // nope, we have got upper cased ASCII char	- this seems to be LESS likely than > and /
                //if(cChar>=65 && cChar<=90)
                if (bCharType > 32)
                {
                    // bCharType in this case contains already lower-cased char
                    sText.bBuffer[sText.iBufPos++] = bCharType;
                    // oChunk.Append(bCharType);
                    continue;
                }

                // we might have namespace : sign here - all text before would have to be
                // saved as namespace and we will need to continue parsing actual tag
                if (bCharType == (byte)TagCharType.NameSpaceColon)
                {
                    // ok here we got a choice - we can just continue and treat the whole
                    // thing as a single tag with namespace stuff prefixed, OR
                    // we can separate first part into namespace and keep tag as normal
                    sText.bBuffer[sText.iBufPos++] = (byte)':';
                    continue;
                }

                // ok, we have got some other char - we break out to deal with it in attributes part
                break;
            }

            if (cPeek == 0)
            {
                return(oChunk);
            }

            // if true then equal sign was found
            //bool bEqualsSign=false;

            // STAGE 2: parse attributes (if any available)
            // attribute name can be standalone or with value after =
            // attribute itself can't have entities or anything like this - we expect it to be in ASCII characters

AttributeParsing:

            string sAttrName;

            if (iTagID != 0)
            {
                // first, skip whitespace:
                if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                {
                    // most likely next char is not-whitespace
                    iCurPos++;

                    if (iCurPos >= iDataLength)
                    {
                        goto ReturnChunk;
                    }

                    cPeek = bHTML[iCurPos];

                    if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                    {
                        // ok long loop here then
                        while (iCurPos < iDataLength)
                        {
                            cPeek = bHTML[iCurPos++];

                            if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                            {
                                continue;
                            }

                            break;
                        }

                        if (cPeek == (byte)'>')
                        {
                            goto ReturnChunk;
                        }

                        iCurPos--;

                        if (iCurPos >= iDataLength)
                        {
                            goto ReturnChunk;
                        }
                    }

                    if (iCurPos >= iDataLength)
                    {
                        goto ReturnChunk;
                    }
                }

                // ok we have got matched tag, it is possible that we might be able to quickly match
                // attribute name known to be used for that tag:
                int iAttrID = oHE.MatchAttr(cPeek, iTagID);

                if (iAttrID > 0)
                {
                    byte[] bAttr = oHE.GetAttrData(iAttrID);

                    if (iCurPos + bAttr.Length + 2 >= iDataLength)
                    {
                        goto ActualAttributeParsing;
                    }

                    // in a loop (and this is not an ideal solution, but still)
                    for (int i = 1; i < bAttr.Length; i++)
                    {
                        // if a single char is not matched, then we
                        if (bAttr[i] != bHTML[iCurPos + i])
                        {
                            goto ActualAttributeParsing;
                        }
                    }

                    byte cNextChar = bHTML[iCurPos + bAttr.Length];

                    // ok, we expect next symbol to be =
                    if (cNextChar == (byte)'=')
                    {
                        sAttrName = oHE.GetAttr(iAttrID);
                        iCurPos  += bAttr.Length + 1;
                        cPeek     = bHTML[iCurPos];

                        goto AttributeValueParsing;
                    }
                }
            }

ActualAttributeParsing:

            sText.Clear();

            // doing exactly the same thing as in tag parsing
            while (cPeek != 0)
            {
                bCharType = bTagCharTypes[cPeek];

                //if(cPeek<=32 && bWhiteSpace[cPeek]==1)
                if (bCharType == (byte)TagCharType.WhiteSpace)
                {
                    iCurPos++;

                    // speculative loop unroll -- we have a very good chance of seeing non-space char next
                    // so instead of setting up loop we will just read it directly, this should save ticks
                    // on having to prepare while() loop
                    if (iCurPos < iDataLength)
                    {
                        cChar = bHTML[iCurPos++];
                    }
                    else
                    {
                        cPeek = 0;
                        break;
                    }

                    bCharType = bTagCharTypes[cChar];

                    //if(cChar==' ' || cChar=='\t' || cChar==13 || cChar==10)
                    //if(cChar<=32 && bWhiteSpace[cChar]==1)
                    if (bCharType == (byte)TagCharType.WhiteSpace)
                    {
                        while (iCurPos < iDataLength)
                        {
                            cChar = bHTML[iCurPos++];

                            bCharType = bTagCharTypes[cChar];
                            if (bCharType == (byte)TagCharType.WhiteSpace)
                            //if(cChar!=' ' && cChar!='\t' && cChar!=13 && cChar!=10)
                            {
                                //cPeek=bHTML[iCurPos];
                                continue;
                            }

                            //if(cChar==(byte)'>')
                            //	goto ReturnChunk;

                            //iCurPos--;
                            break;
                        }

                        if (iCurPos >= iDataLength)
                        {
                            cChar = 0;
                            cPeek = 0;
                            break;
                        }
                    }

                    //bWhiteSpaceHere=true;

                    // now, if we have already got attribute name it means that we need to go to parse value (which may not be present)
                    if (sText.iBufPos > 0)
                    {
                        // oChunk.Append((byte)' ');

                        iCurPos--;

                        if (iCurPos < iDataLength)
                        {
                            cPeek = bHTML[iCurPos];
                        }
                        else
                        {
                            cPeek = 0;
                        }

                        // ok, we have got attribute name and now we have got next char there

                        // most likely we have got = here  and then value
                        if (cPeek == (byte)'=')
                        {
                            //bEqualsSign=true;

                            // move forward one char
                            iCurPos++;

                            if (iCurPos < iDataLength)
                            {
                                cPeek = bHTML[iCurPos];
                            }
                            else
                            {
                                cPeek = 0;
                            }

                            break;
                        }

                        // or we can have end of tag itself, doh!
                        if (cPeek == (byte)'>')
                        {
                            // move forward one char
                            iCurPos++;

                            if (sText.iBufPos > 0)
                            {
                                oChunk.AddParam(sText.SetToStringASCII(), "", (byte)' ');
                            }

                            if (!oChunk.bClosure)
                            {
                                oChunk.oType = HTMLchunkType.OpenTag;
                            }

                            return(oChunk);
                        }

                        // closure
                        if (cPeek == (byte)'/')
                        {
                            oChunk.bClosure    = true;
                            oChunk.bEndClosure = true;
                            oChunk.oType       = HTMLchunkType.CloseTag;
                            continue;
                        }

                        // ok, we have got new char starting after current attribute name is fully parsed
                        // this means the attribute name is on its own and the char we found is start
                        // of a new attribute
                        oChunk.AddParam(sText.SetToStringASCII(), "", (byte)' ');
                        sText.Clear();
                        goto AttributeParsing;
                    }
                }
                else
                {
                    // reuse Peeked char from previous run
                    //cChar=cPeek; iCurPos++;
                    if (iCurPos < iDataLength)
                    {
                        cChar = bHTML[iCurPos++];
                    }
                    else
                    {
                        cChar = 0;
                    }
                }

                if (iCurPos < iDataLength)
                {
                    cPeek = bHTML[iCurPos];
                }
                else
                {
                    cPeek = 0;
                }

                // most likely we should have lower-cased ASCII char here
                if (bCharType == (byte)TagCharType.LowerCasedASCIIorDigit)
                {
                    sText.bBuffer[sText.iBufPos++] = cChar;
                    // oChunk.Append(cChar);
                    continue;
                }

                // = with attribute value to follow
                if (cChar == (byte)'=')
                {
                    //bEqualsSign=true;
                    break;
                }

                // nope, we have got upper cased ASCII char	- this seems to be LESS likely than > and /
                //if(cChar>=65 && cChar<=90)
                if (bCharType > 32)
                {
                    // bCharType in this case contains already lower-cased char
                    sText.bBuffer[sText.iBufPos++] = bCharType;
                    // oChunk.Append(bCharType);
                    continue;
                }

                // tag end - we did not have any params
                if (cChar == (byte)'>')
                {
                    if (sText.iBufPos > 0)
                    {
                        oChunk.AddParam(sText.SetToStringASCII(), "", (byte)' ');
                    }

                    if (!oChunk.bClosure)
                    {
                        oChunk.oType = HTMLchunkType.OpenTag;
                    }

                    return(oChunk);
                }

                // closure of tag sign
                if (cChar == (byte)'/')
                {
                    oChunk.bClosure    = true;
                    oChunk.bEndClosure = true;
                    oChunk.oType       = HTMLchunkType.CloseTag;
                    continue;
                }

                // some other char
                sText.bBuffer[sText.iBufPos++] = cChar;
                // oChunk.Append(cChar);
            }

            if (cPeek == 0)
            {
                if (sText.iBufPos > 0)
                {
                    oChunk.AddParam(sText.SetToStringASCII(), "", (byte)' ');
                }

                if (!oChunk.bClosure)
                {
                    oChunk.oType = HTMLchunkType.OpenTag;
                }

                return(oChunk);
            }

            sAttrName = sText.SetToStringASCII();

AttributeValueParsing:

            /// ***********************************************************************
            /// STAGE 3: parse attribute value
            /// ***********************************************************************

            // the value could be just string, or in quotes (single or double)
            // or we can have next attribute name start, in which case we will jump back to attribute parsing

            // for tracking quotes purposes
            byte cQuotes = cPeek;

            int iValueStartOffset;

            // skip whitespace if any
            if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
            {
                iCurPos++;

                // speculative loop unroll -- we have a very good chance of seeing non-space char next
                // so instead of setting up loop we will just read it directly, this should save ticks
                // on having to prepare while() loop
                if (iCurPos < iDataLength)
                {
                    cPeek = bHTML[iCurPos];
                }
                else
                {
                    iValueStartOffset = iCurPos - 1;
                    goto AttributeValueEnd;
                }

                //if(cChar==' ' || cChar=='\t' || cChar==13 || cChar==10)
                //if(cChar<=32 && bWhiteSpace[cChar]==1)
                if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                {
                    while (iCurPos < iDataLength)
                    {
                        cPeek = bHTML[iCurPos++];

                        if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                        //if(cChar!=' ' && cChar!='\t' && cChar!=13 && cChar!=10)
                        {
                            //cPeek=bHTML[iCurPos];
                            continue;
                        }

                        iCurPos--;
                        break;
                    }

                    if (iCurPos >= iDataLength)
                    {
                        iValueStartOffset = iCurPos - 1;
                        goto AttributeValueEnd;
                    }
                }

                cQuotes = cPeek;
            }



            // because we deal with VALUE of the attribute it means we can't lower-case it,
            // or skip whitespace (if in quotes), which in practice means that we don't need to copy
            // it to temporary string buffer, we can just remember starting offset and then create string from
            // data in bHTML

            // ok, first char can be one of the quote chars or something else
            if (cPeek != '\"' && cPeek != '\'')
            {
                iValueStartOffset = iCurPos;

                cQuotes = (byte)' ';
                // any other char here means we have value up until next whitespace or end of tag
                // this gives us good opportunity to scan fairly quickly without otherwise redundant
                // checks - this should happen fairly rarely, however loop dealing with data between quotes
                // will happen often enough and its best to eliminate as much stuff from it as possible
                //sText.bBuffer[sText.iBufPos++]=cPeek;

                // move to next char
                if (iCurPos < iDataLength)
                {
                    cPeek = bHTML[iCurPos++];
                }
                else
                {
                    goto AttributeValueEnd;
                }

                while (cPeek != 0)
                {
                    // if whitespace then we got our value and need to go back to param
                    if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                    {
                        oChunk.AddParam(sAttrName, GetString(iValueStartOffset, iCurPos - iValueStartOffset - 1), (byte)' ');
                        iCurPos--;
                        goto AttributeParsing;
                    }

                    // end of tag?
                    if (cPeek == (byte)'>')
                    {
                        //iCurPos--;
                        break;
                    }

                    if (iCurPos < iDataLength)
                    {
                        cPeek = bHTML[iCurPos++];
                    }
                    else
                    {
                        iCurPos = iDataLength + 1;
                        goto AttributeValueEnd;
                    }
                }

                // ok we are done, add outstanding attribute
                oChunk.AddParam(sAttrName, GetString(iValueStartOffset, iCurPos - iValueStartOffset - 1), (byte)' ');

                goto ReturnChunk;
            }

            // move one step forward
            iCurPos++;

            iValueStartOffset = iCurPos;

            if (iCurPos < iDataLength)
            {
                cPeek = bHTML[iCurPos++];
            }
            else
            {
                goto AttributeValueEnd;
            }

            // attribute value parsing from between two quotes
            while (cPeek != 0)
            {
                // check whether we have got possible entity (can be anything starting with &)
                if (cPeek == 38)
                {
                    int iPrevPos = iCurPos;

                    char cEntityChar = oE.CheckForEntity(bHTML, ref iCurPos, iDataLength);

                    // restore current symbol
                    if (cEntityChar == 0)
                    {
                        if (iCurPos < iDataLength)
                        {
                            cPeek = bHTML[iCurPos++];
                        }
                        else
                        {
                            break;
                        }

                        //sText.bBuffer[sText.iBufPos++]=38; //(byte)'&';;
                        continue;
                    }
                    else
                    {
                        // okay we have got an entity, our hope of not having to copy stuff into variable
                        // is over, we have to continue in a slower fashion :(
                        // but thankfully this should happen very rarely, so, annoying to code, but
                        // most codepaths will run very fast!
                        int iPreEntLen = iPrevPos - iValueStartOffset - 1;

                        // 14/05/08 need to clear text - it contains attribute name text
                        sText.Clear();

                        // copy previous data
                        if (iPreEntLen > 0)
                        {
                            Array.Copy(bHTML, iValueStartOffset, sText.bBuffer, 0, iPreEntLen);
                            sText.iBufPos = iPreEntLen;
                        }

                        // we have to skip now to next byte, since
                        // some converted chars might well be control chars like >
                        oChunk.bEntities = true;

                        if (cChar == (byte)'<')
                        {
                            oChunk.bLtEntity = true;
                        }

                        // unless is space we will ignore it
                        // note that this won't work if &nbsp; is defined as it should
                        // byte int value of 160, rather than 32.
                        //if(cChar!=' ')
                        sText.Append(cEntityChar);

                        if (iCurPos < iDataLength)
                        {
                            cPeek = bHTML[iCurPos++];
                        }
                        else
                        {
                            goto AttributeValueEnd;
                        }

                        // okay, we continue here using in effect new inside loop as we might have more entities here
                        // attribute value parsing from between two quotes
                        while (cPeek != 0)
                        {
                            // check whether we have got possible entity (can be anything starting with &)
                            if (cPeek == 38)
                            {
                                char cNewEntityChar = oE.CheckForEntity(bHTML, ref iCurPos, iDataLength);

                                // restore current symbol
                                if (cNewEntityChar != 0)
                                {
                                    if (cNewEntityChar == (byte)'<')
                                    {
                                        oChunk.bLtEntity = true;
                                    }

                                    sText.Append(cNewEntityChar);

                                    if (iCurPos < iDataLength)
                                    {
                                        cPeek = bHTML[iCurPos++];
                                    }
                                    else
                                    {
                                        goto AttributeValueEnd;
                                    }

                                    continue;
                                }
                            }

                            // check if is end of quotes
                            if (cPeek == cQuotes)
                            {
                                // ok we finished scanning it: add param with value and then go back to param name parsing
                                oChunk.AddParam(sAttrName, sText.SetToString(), cQuotes);

                                if (iCurPos < iDataLength)
                                {
                                    cPeek = bHTML[iCurPos];
                                }
                                else
                                {
                                    break;
                                }

                                goto AttributeParsing;
                            }

                            sText.bBuffer[sText.iBufPos++] = cPeek;
                            //sText.Append(cPeek);

                            if (iCurPos < iDataLength)
                            {
                                cPeek = bHTML[iCurPos++];
                            }
                            else
                            {
                                break;
                            }
                        }

                        oChunk.AddParam(sAttrName, sText.SetToString(), cQuotes);
                        goto ReturnChunk;
                    }
                }

                // check if is end of quotes
                if (cPeek == cQuotes)
                {
                    // ok we finished scanning it: add param with value and then go back to param name parsing
                    //sText.Clear();

                    oChunk.AddParam(sAttrName, GetString(iValueStartOffset, iCurPos - iValueStartOffset - 1), cQuotes);


                    if (iCurPos < iDataLength)
                    {
                        cPeek = bHTML[iCurPos];
                    }
                    else
                    {
                        //iCurPos++;
                        break;
                    }

                    goto AttributeParsing;
                }

                if (iCurPos < iDataLength)
                {
                    cPeek = bHTML[iCurPos++];
                }
                else
                {
                    //iCurPos++;
                    break;
                }
            }

AttributeValueEnd:



            // ok we are done, add outstanding attribute
            int iLen = iCurPos - iValueStartOffset - 1;

            if (iLen > 0)
            {
                oChunk.AddParam(sAttrName, GetString(iValueStartOffset, iLen), cQuotes);
            }
            else
            {
                oChunk.AddParam(sAttrName, "", cQuotes);
            }

ReturnChunk:

            if (oChunk.bClosure)
            {
                oChunk.oType = HTMLchunkType.CloseTag;
            }
            else
            {
                oChunk.oType = HTMLchunkType.OpenTag;
            }

            return(oChunk);
        }
示例#35
0
 /// <summary>
 /// Parses the next chunk without advancing the parser: the current position and the
 /// current chunk are saved before calling <c>ParseNext</c> and put back afterwards.
 /// </summary>
 /// <returns>Whatever <c>ParseNext</c> returns for the upcoming chunk.</returns>
 public HTMLchunk PeakNext()
 {
     var currPos = CurPos;
     var currChunk = CurrentChunk;

     // swap in a fresh chunk for the look-ahead
     // NOTE(review): presumably ParseNext fills CurrentChunk, so this keeps the caller's chunk intact - confirm
     CurrentChunk = new HTMLchunk(true);

     try
     {
         return ParseNext();
     }
     finally
     {
         // restore the saved state even when ParseNext throws; otherwise the parser would be
         // left pointing at the temporary chunk and at a moved position
         CurrentChunk = currChunk;
         CurPos = currPos;
     }
 }
示例#36
0
 /// <summary>
 /// Updates the caller's state value according to the name of an opening tag.
 /// </summary>
 /// <param name="oChunk">Chunk whose <c>sTag</c> is inspected</param>
 /// <param name="state">State value, updated in place</param>
 private void HandleOpenTag(HTMLchunk oChunk, ref int state)
 {
     switch (oChunk.sTag)
     {
         case "ol":
             // "ol" always puts the state at 1, whatever it was before
             state = 1;
             break;

         case "li":
             // "li" moves to 2, but only when the state is already positive
             if (state > 0)
             {
                 state = 2;
             }
             break;

         case "a":
             // "a" bumps states 4, 8 and 10 by one; every other state is left untouched
             if (state == 4 || state == 8 || state == 10)
             {
                 state++;
             }
             break;
     }
 }
示例#37
0
 /// <summary>
 /// Reacts to an opening tag: moves the state forward for "ol" and "table" when the state
 /// has the expected value, and counts "div" tags opened while in state 6, 14 or 16.
 /// </summary>
 /// <param name="oChunk">Chunk whose <c>sTag</c> is inspected</param>
 /// <param name="state">State value, updated in place</param>
 private void HandleOpenTag( HTMLchunk oChunk, ref int state )
 {
     switch ( oChunk.sTag )
     {
         case "ol":
             // 8 -> 9
             if ( state == 8 )
             {
                 state = 9;
             }
             break;

         case "table":
             // 11 -> 12
             if ( state == 11 )
             {
                 state = 12;
             }
             break;

         case "div":
             // state itself is not changed here, only the div counter
             switch ( state )
             {
                 case 6:
                 case 14:
                 case 16:
                     divCount++;
                     break;
             }
             break;
     }
 }
示例#38
0
        /// <summary>
        /// Drops the references held by this object. Only the first call does any work,
        /// later calls return immediately.
        /// </summary>
        /// <param name="bDisposing">Not read by this implementation</param>
        private void Dispose(bool bDisposing)
        {
            // nothing left to release
            if (bDisposed)
            {
                return;
            }

            bDisposed = true;

            // clear every held reference
            bHTML  = null;
            oChunk = null;
            sText  = null;
            oE     = null;
            oP     = null;
        }
示例#39
0
        /// <summary>
        /// Appends the text of a chunk to the <c>dictResult</c> member selected by the current state.
        /// The text is trimmed of spaces, tabs and line breaks, except for state 4 where it is
        /// HTML-decoded instead. States not listed below are ignored.
        /// </summary>
        /// <param name="oChunk">Text chunk whose <c>oHTML</c> is appended</param>
        /// <param name="state">Current state; only read here</param>
        private void HandleText( HTMLchunk oChunk, ref int state )
        {
            // characters stripped from both ends of the text
            char[] blanks = { ' ', '\t', '\r', '\n' };

            switch ( state )
            {
                case 2:
                    dictResult.Word += oChunk.oHTML.Trim( blanks );
                    break;

                case 4:
                    // decoded rather than trimmed
                    dictResult.Pronunciation += System.Web.HttpUtility.HtmlDecode( oChunk.oHTML );
                    break;

                case 6:
                    dictResult.ChineseExplanations += oChunk.oHTML.Trim( blanks );
                    break;

                case 9:
                    dictResult.Examples += oChunk.oHTML.Trim( blanks );
                    break;

                case 12:
                    dictResult.Variations += oChunk.oHTML.Trim( blanks );
                    break;

                case 14:
                    dictResult.EnglishExplanations += oChunk.oHTML.Trim( blanks );
                    break;

                case 16:
                    dictResult.FromEncyclopedia += oChunk.oHTML.Trim( blanks );
                    break;
            }
        }
示例#40
0
        /// <summary>
        /// Parses HTML chunk by chunk and prints each parsed chunk to the console. When
        /// bReadLineDelay is set it waits for ENTER after each chunk, otherwise it just ends the line.
        /// </summary>
        /// <param name="oP">Parser object</param>
        void ParseAndPrint(HTMLparser oP)
        {
            //	bReadLineDelay=false;
            if (bReadLineDelay)
            {
                Console.WriteLine("Parsing HTML, will print each parsed chunk, press ENTER after each to continue");
            }

            // parser will return us tokens called HTMLchunk -- warning DO NOT destroy it until end of parsing
            // because HTMLparser re-uses this object
            HTMLchunk oChunk = null;

            // NOTE: bear in mind that when you deal with content which uses non-Latin chars, then you
            // need to ensure that the correct encoding is set. This is often set in the HTML itself, but sometimes
            // only in HTTP headers for a given page - some pages use BOTH, but browsers seem to
            // consider the HTTP header setting as more important, so it is best to behave in a similar way.

            // See below for code that deals with META based charset setting; similarly you need to call
            // it here if charset is set in the Content-Type header

            // we will track whether encoding was set or not here, this is important
            // because we may have to do re-encoding of text found BEFORE the META tag, this typically
            // happens for TITLE tags only - if we had no encoding set and then had it set, then
            // we need to re-encode it, highly annoying, but having garbage in the title is even more annoying
            bool bEncodingSet = false;

            // debug: hand the parser the iso-8859-1 encoding before parsing starts
            oP.SetEncoding(System.Text.Encoding.GetEncoding("iso-8859-1"));

            // we parse until returned oChunk is null indicating we reached end of parsing
            while ((oChunk = oP.ParseNext()) != null)
            {
                switch (oChunk.oType)
                {
                // matched open tag, ie <a href="">
                case HTMLchunkType.OpenTag:
                    Console.Write("Open tag: " + oChunk.sTag);

                    // in order to set the correct encoding we need to keep an eye on META tags
                    // that hint to us which encoding should be used, note here
                    // that some webpages have TITLE set BEFORE meta-tags, which means you will
                    // have to re-encode it in order to get the correct representation of text

                    // shared entry point: the CloseTag and Script cases jump here with goto
                    // to reuse the parameter printing code below
PrintParams:

                    if (oChunk.sTag.Length == 4 && oChunk.sTag == "meta")
                    {
                        HandleMetaEncoding(oP, oChunk, ref bEncodingSet);
                    }
                    ;

                    // commented out call to code that will do the job for you - long code below
                    // is left to demonstrate how to access individual param values
                    // Console.WriteLine("{0}",oChunk.GenerateParamsHTML());



                    if (oChunk.bHashMode)
                    {
                        // params are read from the oParams collection, keyed by param name
                        if (oChunk.oParams.Count > 0)
                        {
                            foreach (string sParam in oChunk.oParams.Keys)
                            {
                                string sValue = oChunk.oParams[sParam].ToString();

                                // params with an empty value are printed as a bare name
                                if (sValue.Length > 0)
                                {
                                    Console.Write(" {0}='{1}'", sParam, sValue);
                                }
                                else
                                {
                                    Console.Write(" {0}", sParam);
                                }
                            }
                        }
                    }
                    else
                    {
                        // this is an alternative method of getting params -- it may look less convenient
                        // but it saves a LOT of CPU ticks while parsing. It makes sense when you only need
                        // params for a few tags
                        if (oChunk.iParams > 0)
                        {
                            for (int i = 0; i < oChunk.iParams; i++)
                            {
                                // here we can use exactly the same single/double quotes as they
                                // were used on params

                                switch (oChunk.cParamChars[i])
                                {
                                // a space in cParamChars is printed as an unquoted value (or a bare name when the value is empty)
                                case (byte)' ':
                                    if (oChunk.sValues[i].Length == 0)
                                    {
                                        Console.Write(" {0}", oChunk.sParams[i]);
                                    }
                                    else
                                    {
                                        Console.Write(" {0}={1}", oChunk.sParams[i], oChunk.sValues[i]);
                                    }
                                    break;

                                // anything else is used as the quote char on both sides of the value
                                default:
                                    Console.Write(" {0}={1}{2}{1}", oChunk.sParams[i], (char)oChunk.cParamChars[i], oChunk.sValues[i]);
                                    break;
                                }
                            }
                        }
                    }

                    break;

                // matched close tag, ie </a>
                case HTMLchunkType.CloseTag:
                    //Console.Write(oChunk.GenerateHTML());

                    Console.Write("Closed tag: " + oChunk.sTag);

                    // a closed tag that carries params gets them printed by the OpenTag code
                    if (oChunk.iParams > 0)
                    {
                        goto PrintParams;
                    }

                    break;

                // NOTE: you have to call finalisation because it is not done for Scripts or comments
                // Matched data between <script></script> tags
                case HTMLchunkType.Script:

                    // raw HTML is requested explicitly when neither keep-flag is set on the parser
                    if (!oP.bAutoKeepScripts && !oP.bKeepRawHTML)
                    {
                        oP.SetRawHTML(oChunk);
                    }

                    if (oChunk.oHTML.Length > 0)
                    {
                        Console.Write("Script: " + oChunk.oHTML);
                    }
                    else
                    {
                        Console.Write("Script: [ignored for performance reasons]");
                    }

                    // script tags with params also reuse the OpenTag printing code
                    if (oChunk.iParams > 0)
                    {
                        goto PrintParams;
                    }

                    break;

                // NOTE: you have to call finalisation because it is not done for Scripts or comments
                // matched HTML comment, that's stuff between <!-- and -->
                case HTMLchunkType.Comment:

                    //Console.WriteLine("{0}",oChunk.GenerateHTML());

                    if (oP.bKeepRawHTML || oP.bAutoKeepComments)
                    {
                        // by default we won't finalise automatically as comments are often
                        // very lengthy and it is costly to create long strings when they are not
                        // needed, ie: during indexing of text
                        Console.Write("Comment: " + oChunk.oHTML);
                    }
                    else
                    {
                        // Even if raw HTML by default was not taken you can get it anyway by
                        // uncommenting next line
                        //oP.SetRawHTML(oChunk);

                        Console.Write("Comment: [ignored for performance reasons]");
                    }
                    break;

                // matched normal text
                case HTMLchunkType.Text:

                    // skip pure whitespace that we are not really interested in;
                    // the continue also skips the ReadLine/WriteLine at the bottom of the loop
                    if (oP.bCompressWhiteSpaceBeforeTag && oChunk.oHTML.Trim().Length == 0 && bReadLineDelay)
                    {
                        continue;
                    }

                    Console.Write("Text: '{0}'", oChunk.oHTML);
                    break;
                }
                ;

                // either wait for ENTER or just terminate the line printed for this chunk
                if (bReadLineDelay)
                {
                    Console.ReadLine();
                }
                else
                {
                    Console.WriteLine("");
                }
            }
        }
示例#41
0
		/// <summary>
		/// Parses the next chunk (text run or tag) from the HTML buffer and returns it.
		/// </summary>
		/// <remarks>
		/// The method works on the parser's oChunk member, which is reset via Clear() at the
		/// start of every call, so callers should presumably not hold on to a previously
		/// returned chunk across calls.
		/// Text preceding a tag is returned first; the tag's opening byte is put back
		/// (iCurPos--) so that the tag itself is parsed by the following call.
		/// </remarks>
		/// <param name="bKeepWhiteSpace">If true then whitespace will be preserved (slower)</param>
		/// <returns>HTMLchunk or null if end of data reached</returns>
		public HTMLchunk ParseNext(bool bKeepWhiteSpace)
		{
			// start from a clean chunk; it is treated as text until a tag is encountered
			oChunk.Clear();
			oChunk.oType=HTMLchunkType.Text;

			// bWhiteSpace: true when whitespace was skipped immediately before the current byte
			bool bWhiteSpace=false;
			// cChar: the byte currently being examined
			byte cChar=0x00;

			while(true)
			{
				if(!bKeepWhiteSpace)
				{
					// the loop below appears to be an inlined form of the disabled call:
					//bWhiteSpace=SkipWhiteSpace();

					bWhiteSpace=false;

					// skip spaces, tabs, CR (13) and LF (10), remembering whether any were skipped
					while(iCurPos<iDataLength)
					{
						cChar=bHTML[iCurPos++];

						if(cChar!=' ' && cChar!='\t' && cChar!=13 && cChar!=10)
						{
							// we don't do anything because we found char that can be used down the pipeline
							// without need to look it up again
							//PutChar();
							//iCurPos--;
							goto WhiteSpaceDone;
						}
						else
							bWhiteSpace=true;
					}

					// ran out of data while skipping whitespace: leave the main loop
					break;

				}
				else
				{
					cChar=NextChar();

					// we are definitely done: a 0 from NextChar() is treated as end of data
					if(cChar==0)
						break;
				}

			// jump target for the whitespace-skipping loop above; cChar holds the first
			// non-whitespace byte (or, in keep-whitespace mode, the byte from NextChar())
			WhiteSpaceDone:

				switch((byte)cChar)
				{
						//case '<':
					case 60:


						// we may have found text bit before getting to the tag
						// in which case we need to put back tag byte and return
						// found text first, the tag will be parsed next time
						if(oChunk.iBufPos>0 || bWhiteSpace)
						{
							// we will add 1 white space char to compensate for
							// loss of space before tag since this space often serves as a delimiter between words
							if(bWhiteSpace)
								oChunk.Append(0x20);

							// put the '<' back so the next call starts at the tag
							//PutChar();
							iCurPos--;

							// finalise chunk if text mode is not false
							if(bTextMode)
								oChunk.Finalise();

							return oChunk;
						}

						// nothing buffered: parse the tag right away; when raw HTML is kept the
						// returned chunk is additionally finalised before being handed back
						if(!bKeepRawHTML)
							return ParseTag(bKeepWhiteSpace);
						else
						{
							oChunk=ParseTag(bKeepWhiteSpace);

							oChunk.Finalise();

							return oChunk;
						}

						/*
						 * disabled debugging aid:
						 * case 179:
							Console.WriteLine("Found: {0} in {1}!",(char)cChar,oChunk.oHTML.ToString());
							break;
							*/

					// carriage return: never added to the chunk
					case 13:
						break;

					// line feed: kept only when whitespace is being preserved
					case 10:
						if(bKeepWhiteSpace)
						{
							/*
							(disabled)
							if(oChunk==null)
							{
								oChunk=new HTMLchunk(false);
								oChunk.oType=HTMLchunkType.Text;
							}
							*/

							oChunk.Append(cChar);
						}
						break;

					default:

						/*
						(disabled)
						if(oChunk==null)
						{
							oChunk=new HTMLchunk(false);
							oChunk.oType=HTMLchunkType.Text;
						}
						*/
						// text bytes are only collected in text mode; otherwise they are dropped
						if(bTextMode)
						{

							// check if its entity
							if(cChar=='&')
							{
								// NOTE(review): the result is narrowed to a byte, so an entity value
								// above 255 would be truncated -- confirm what CheckForEntity returns
								cChar=(byte)CheckForEntity();

								// restore current symbol: 0 is treated as "not an entity",
								// so the literal '&' is kept
								if(cChar==0)
									cChar=(byte)'&';
								else
								{
									oChunk.bEntities=true;

									// flag a decoded '<' so it can be told apart from a real tag start
									if(cChar=='<')
										oChunk.bLtEntity=true;
								}
							}

							if(bReturnSplitWords)
							{
								// split-words mode: whitespace and punctuation end the current word
								if(bWhiteSpace)
								{
									if(oChunk.iBufPos>0)
									{
										// put the current byte back so it starts the next word
										// NOTE(review): if cChar came from CheckForEntity() and that call
										// advanced iCurPos past the entity, stepping back one byte may not
										// re-read the whole entity -- confirm
										//PutChar();
										iCurPos--;

										oChunk.Finalise();
										return oChunk;
									}
								}
								else
								{
									if(char.IsPunctuation((char)cChar))
									{
										// the punctuation byte is neither appended nor put back,
										// so it is dropped from the output in both branches below
										if(oChunk.iBufPos>0)
										{
											//PutChar();
											oChunk.Finalise();
											return oChunk;
										}
										else
											break;
									}
								}
							}
							else
							{
								// collapse the skipped whitespace run into one space
								// (the bTextMode test is redundant here: we are already inside if(bTextMode))
								if(bWhiteSpace && bTextMode)
									oChunk.Append((byte)' ');
							}

							oChunk.Append(cChar);
						}

						break;
				};

			}

			// end of data reached: nothing buffered means there is no final chunk to return
			if(oChunk.iBufPos==0)
				return null;

			// it will be null if we have not found any data

			if(bTextMode)
				oChunk.Finalise();

			return oChunk;
		}
示例#42
0
        /// <summary>
        /// Inspects the attributes of a parsed tag and advances the result-scraping
        /// state machine accordingly.
        /// </summary>
        /// <remarks>
        /// Attributes are processed in order and <paramref name="state"/> is updated as
        /// soon as one matches, so a later attribute of the same tag sees the new state.
        /// Transitions driven by a <c>class</c> attribute:
        /// <c>"g"</c> 2 -> 3 (also stores the current item if it already has a URL and
        /// starts a new one), <c>"r"</c> 3 -> 4, <c>"s"</c> 6 -> 7, <c>"gl"</c> 7 -> 8.
        /// An <c>href</c> attribute is stored as the item URL in state 5, or as the
        /// similar/cache URL in states 9 and 11 depending on its query text.
        /// </remarks>
        /// <param name="oChunk">Tag chunk whose sParams / sValues arrays (first iParams entries) are examined.</param>
        /// <param name="state">Current state of the scraper; updated in place.</param>
        private void HandleParam(HTMLchunk oChunk, ref int state)
        {
            for (int i = 0; i < oChunk.iParams; i++)
            {
                string paramName = oChunk.sParams[i];
                string paramValue = oChunk.sValues[i];
                bool isClass = paramName == "class";

                if (isClass && paramValue == "g" && state == 2)
                {
                    state = 3;

                    // a new result block begins: keep the previous item only if it was filled in
                    if (!string.IsNullOrEmpty(item.Url))
                    {
                        searchResult.Results.Add(item);
                        item = new SearchEngineResult.ResultItem();
                    }
                }
                else if (isClass && paramValue == "r" && state == 3)
                {
                    state = 4;
                }
                else if (isClass && paramValue == "s" && state == 6)
                {
                    state = 7;
                }
                else if (isClass && paramValue == "gl" && state == 7)
                {
                    state = 8;
                }
                else if (paramName == "href")
                {
                    if (state == 5)
                    {
                        item.Url = paramValue;
                    }
                    else if (state == 9 || state == 11)
                    {
                        // string.Contains is an ordinal search, which is what is wanted for
                        // URL fragments (the culture-sensitive IndexOf(string) is not)
                        if (paramValue.Contains("q=related"))
                        {
                            item.SimilarUrl = paramValue;
                        }
                        else if (paramValue.Contains("q=cache"))
                        {
                            item.CacheUrl = paramValue;
                        }
                    }
                }
            }
        }