예제 #1
0
        /// <summary>
        /// Scan HTML source, reporting lexical events.
        /// </summary>
        /// <remarks>
        /// Runs the table-driven state machine starting in <c>S_PCDATA</c> until it
        /// reaches <c>S_DONE</c>. Characters are examined with
        /// <see cref="TextReader.Peek()"/> and consumed at the end of each iteration
        /// unless the selected action asks for the character to be re-examined
        /// ("unread"). A lone CR and a CR LF pair are both presented to the state
        /// machine as a single <c>'\n'</c>. <c>theCurrentLine</c> and
        /// <c>theCurrentColumn</c> are updated as characters are processed.
        /// When the loop ends, <c>h.EOF</c> is invoked once.
        /// </remarks>
        /// <param name="r">Reader that provides characters</param>
        /// <param name="h">ScanHandler that accepts lexical events.</param>
        /// <exception cref="Exception">
        /// The state table has no action for the current character in the current
        /// state, or it selects an action code this method does not know.
        /// </exception>
        public virtual void Scan(TextReader r, IScanHandler h)
        {
            theState = S_PCDATA;

            int firstChar = r.Peek();   // Remove any leading BOM

            if (firstChar == '\uFEFF')
            {
                r.Read();
            }

            while (theState != S_DONE)
            {
                // Peek only: the character is consumed at the bottom of the loop
                // unless an action sets 'unread'.
                int  ch     = r.Peek();
                bool unread = false;

                // Process control characters: code points 0x80..0x9F are remapped
                // through theWinMap (presumably Windows-1252 equivalents - confirm
                // against the table's definition).
                if (ch >= 0x80 && ch <= 0x9F)
                {
                    ch = theWinMap[ch - 0x80];
                }

                if (ch == '\r')
                {
                    r.Read();
                    ch = r.Peek();      // expect LF next
                    if (ch != '\n')
                    {
                        // Lone CR: treat it as a newline and leave the character
                        // that follows it in the reader for the next iteration.
                        unread = true;
                        ch     = '\n';
                    }
                }

                if (ch == '\n')
                {
                    theCurrentLine++;
                    theCurrentColumn = 0;
                }
                else
                {
                    theCurrentColumn++;
                }

                if (!(ch >= 0x20 || ch == '\n' || ch == '\t' || ch == -1))
                {
                    // Ignored control character. Peek() does not advance the
                    // reader, so the character must be consumed here; without
                    // this Read() the loop would peek the same character forever.
                    r.Read();
                    continue;
                }

                // Search state table
                // adjCh is -1 for EOF, -2 for any character outside the indexed range.
                int adjCh         = (ch >= -1 && ch < statetableIndexMaxChar) ? ch : -2;
                int statetableRow = statetableIndex[theState][adjCh + 2];
                int action        = 0;
                if (statetableRow != -1)
                {
                    action       = statetable[statetableRow + 2];
                    theNextState = statetable[statetableRow + 3];
                }

                switch (action)
                {
                case 0:
                    throw new Exception(
                              "HTMLScanner can't cope with " + (int)ch + " in state " +
                              (int)theState);

                case A_ADUP:
                    h.Adup(theOutputBuffer, 0, theSize);
                    theSize = 0;
                    break;

                case A_ADUP_SAVE:
                    h.Adup(theOutputBuffer, 0, theSize);
                    theSize = 0;
                    Save(ch, h);
                    break;

                case A_ADUP_STAGC:
                    h.Adup(theOutputBuffer, 0, theSize);
                    theSize = 0;
                    h.STagC(theOutputBuffer, 0, theSize);
                    break;

                case A_ANAME:
                    h.Aname(theOutputBuffer, 0, theSize);
                    theSize = 0;
                    break;

                case A_ANAME_ADUP:
                    h.Aname(theOutputBuffer, 0, theSize);
                    theSize = 0;
                    h.Adup(theOutputBuffer, 0, theSize);
                    break;

                case A_ANAME_ADUP_STAGC:
                    h.Aname(theOutputBuffer, 0, theSize);
                    theSize = 0;
                    h.Adup(theOutputBuffer, 0, theSize);
                    h.STagC(theOutputBuffer, 0, theSize);
                    break;

                case A_AVAL:
                    h.Aval(theOutputBuffer, 0, theSize);
                    theSize = 0;
                    break;

                case A_AVAL_STAGC:
                    h.Aval(theOutputBuffer, 0, theSize);
                    theSize = 0;
                    h.STagC(theOutputBuffer, 0, theSize);
                    break;

                case A_CDATA:
                    Mark();
                    // suppress the final "]]" in the buffer
                    if (theSize > 1)
                    {
                        theSize -= 2;
                    }
                    h.PCDATA(theOutputBuffer, 0, theSize);
                    theSize = 0;
                    break;

                case A_ENTITY_START:
                    h.PCDATA(theOutputBuffer, 0, theSize);
                    theSize = 0;
                    Save(ch, h);
                    break;

                case A_ENTITY:
                    Mark();
                    char ch1 = (char)ch;
                    // While the character can still extend the reference, keep
                    // collecting it (switching to the numeric / hex-numeric states
                    // on '#' and 'x'/'X').
                    if (theState == S_ENT && ch1 == '#')
                    {
                        theNextState = S_NCR;
                        Save(ch, h);
                        break;
                    }
                    else if (theState == S_NCR && (ch1 == 'x' || ch1 == 'X'))
                    {
                        theNextState = S_XNCR;
                        Save(ch, h);
                        break;
                    }
                    else if (theState == S_ENT && char.IsLetterOrDigit(ch1))
                    {
                        Save(ch, h);
                        break;
                    }
                    else if (theState == S_NCR && char.IsDigit(ch1))
                    {
                        Save(ch, h);
                        break;
                    }
                    else if (theState == S_XNCR && (char.IsDigit(ch1) || "abcdefABCDEF".IndexOf(ch1) != -1))
                    {
                        Save(ch, h);
                        break;
                    }

                    // The whole entity reference has been collected; offset 1
                    // skips the first buffered character (the one saved by
                    // A_ENTITY_START).
                    h.Entity(theOutputBuffer, 1, theSize - 1);
                    int ent = h.GetEntity();
                    if (ent != 0)
                    {
                        // Handler resolved the reference: replace the buffered
                        // text with the resulting character(s).
                        theSize = 0;
                        if (ent >= 0x80 && ent <= 0x9F)
                        {
                            ent = theWinMap[ent - 0x80];
                        }
                        if (ent < 0x20)
                        {
                            // Control becomes space
                            // NOTE(review): the value is reassigned but not passed
                            // to Save() in this branch - confirm that is intended.
                            ent = 0x20;
                        }
                        else if (ent >= 0xD800 && ent <= 0xDFFF)
                        {
                            // Surrogates get dropped
                            ent = 0;
                        }
                        else if (ent <= 0xFFFF)
                        {
                            // BMP character
                            Save(ent, h);
                        }
                        else
                        {
                            // Astral converted to two surrogates
                            ent -= 0x10000;
                            Save((ent >> 10) + 0xD800, h);
                            Save((ent & 0x3FF) + 0xDC00, h);
                        }
                        if (ch != ';')
                        {
                            // The terminating character is not part of the
                            // reference; leave it for the next iteration.
                            unread = true;
                            theCurrentColumn--;
                        }
                    }
                    else
                    {
                        // Unresolved reference: the buffered text is kept and the
                        // current character is re-examined in S_PCDATA.
                        unread = true;
                        theCurrentColumn--;
                    }
                    theNextState = S_PCDATA;
                    break;

                case A_ETAG:
                    h.ETag(theOutputBuffer, 0, theSize);
                    theSize = 0;
                    break;

                case A_DECL:
                    h.Decl(theOutputBuffer, 0, theSize);
                    theSize = 0;
                    break;

                case A_GI:
                    h.GI(theOutputBuffer, 0, theSize);
                    theSize = 0;
                    break;

                case A_GI_STAGC:
                    h.GI(theOutputBuffer, 0, theSize);
                    theSize = 0;
                    h.STagC(theOutputBuffer, 0, theSize);
                    break;

                case A_LT:
                    Mark();
                    Save('<', h);
                    Save(ch, h);
                    break;

                case A_LT_PCDATA:
                    Mark();
                    Save('<', h);
                    h.PCDATA(theOutputBuffer, 0, theSize);
                    theSize = 0;
                    break;

                case A_PCDATA:
                    Mark();
                    h.PCDATA(theOutputBuffer, 0, theSize);
                    theSize = 0;
                    break;

                case A_CMNT:
                    Mark();
                    h.Cmnt(theOutputBuffer, 0, theSize);
                    theSize = 0;
                    break;

                case A_MINUS3:
                    Save('-', h);
                    Save(' ', h);
                    break;

                case A_MINUS2:
                    Save('-', h);
                    Save(' ', h);
                    // Same two saves as A_MINUS; they are repeated here because
                    // C# does not allow falling through into the next case.
                    Save('-', h);
                    Save(ch, h);
                    break;

                case A_MINUS:
                    Save('-', h);
                    Save(ch, h);
                    break;

                case A_PI:
                    Mark();
                    h.PI(theOutputBuffer, 0, theSize);
                    theSize = 0;
                    break;

                case A_PITARGET:
                    h.PITarget(theOutputBuffer, 0, theSize);
                    theSize = 0;
                    break;

                case A_PITARGET_PI:
                    h.PITarget(theOutputBuffer, 0, theSize);
                    theSize = 0;
                    h.PI(theOutputBuffer, 0, theSize);
                    break;

                case A_SAVE:
                    Save(ch, h);
                    break;

                case A_SKIP:
                    break;

                case A_SP:
                    Save(' ', h);
                    break;

                case A_STAGC:
                    h.STagC(theOutputBuffer, 0, theSize);
                    theSize = 0;
                    break;

                case A_EMPTYTAG:
                    Mark();
                    if (theSize > 0)
                    {
                        h.GI(theOutputBuffer, 0, theSize);
                    }
                    theSize = 0;
                    h.STagE(theOutputBuffer, 0, theSize);
                    break;

                case A_UNGET:
                    // Leave the character in the reader and undo the column
                    // advance made above so it is counted only once.
                    unread = true;
                    theCurrentColumn--;
                    break;

                case A_UNSAVE_PCDATA:
                    if (theSize > 0)
                    {
                        theSize--;
                    }
                    h.PCDATA(theOutputBuffer, 0, theSize);
                    theSize = 0;
                    break;

                default:
                    throw new Exception("Can't process state " + action);
                }
                if (!unread)
                {
                    // Consume the character that was peeked at the top of the loop.
                    r.Read();
                }
                theState = theNextState;
            }
            h.EOF(theOutputBuffer, 0, 0);
        }
예제 #2
0
        /// <summary>
        /// Scan line-oriented input, reporting one or more lexical events per line.
        /// </summary>
        /// <remarks>
        /// The first character of each line selects the event:
        /// <c>(</c> start-tag name (<c>GI</c>), <c>)</c> end tag (<c>ETag</c>),
        /// <c>?</c> processing instruction (<c>PI</c>), <c>A</c> attribute
        /// (<c>Aname</c> then <c>Aval</c>, split at the first space),
        /// <c>-</c> character data (<c>PCDATA</c>), <c>E</c> entity (<c>Entity</c>).
        /// A pending start tag is closed with <c>STagC</c> before any event other
        /// than an attribute is reported. Empty lines and lines starting with any
        /// other character are ignored. <c>EOF</c> is reported once after the last line.
        /// </remarks>
        /// <param name="br">Reader that provides the input lines.</param>
        /// <param name="h">ScanHandler that accepts lexical events.</param>
        public virtual void Scan(TextReader br, IScanHandler h)
        {
            string s;

            // Start with an empty buffer so that EOF is never handed a null array,
            // even when the input contains no lines at all.
            char[] buff   = new char[0];
            bool   instag = false;   // true while a start tag is open (GI sent, STagC not yet sent)

            while ((s = br.ReadLine()) != null)
            {
                int size = s.Length;
                if (size == 0)
                {
                    // An empty line has no type character; indexing buff[0]
                    // would throw, so skip it like any unrecognized line.
                    continue;
                }

                buff = s.ToCharArray();
                switch (buff[0])
                {
                case '(':
                    if (instag)
                    {
                        h.STagC(buff, 0, 0);
                    }
                    h.GI(buff, 1, size - 1);
                    instag = true;
                    break;

                case ')':
                    if (instag)
                    {
                        h.STagC(buff, 0, 0);
                        instag = false;
                    }
                    h.ETag(buff, 1, size - 1);
                    break;

                case '?':
                    if (instag)
                    {
                        h.STagC(buff, 0, 0);
                        instag = false;
                    }
                    h.PI(buff, 1, size - 1);
                    break;

                case 'A':
                    int sp = s.IndexOf(' ');
                    if (sp < 0)
                    {
                        // No separator: the rest of the line is the attribute
                        // name and the value is empty. Without this guard the
                        // handler would receive a negative length.
                        h.Aname(buff, 1, size - 1);
                        h.Aval(buff, 0, 0);
                    }
                    else
                    {
                        h.Aname(buff, 1, sp - 1);
                        h.Aval(buff, sp + 1, size - sp - 1);
                    }
                    break;

                case '-':
                    if (instag)
                    {
                        h.STagC(buff, 0, 0);
                        instag = false;
                    }
                    if (s.Equals("-\\n", StringComparison.Ordinal))
                    {
                        // The line is exactly "-" followed by the two characters
                        // '\' and 'n': report a single newline character.
                        buff[0] = '\n';
                        h.PCDATA(buff, 0, 1);
                    }
                    else
                    {
                        // FIXME:
                        // Does not decode \t and \\ in input
                        h.PCDATA(buff, 1, size - 1);
                    }
                    break;

                case 'E':
                    if (instag)
                    {
                        h.STagC(buff, 0, 0);
                        instag = false;
                    }
                    h.Entity(buff, 1, size - 1);
                    break;

                default:
                    // Unrecognized line types are silently ignored.
                    break;
                }
            }
            h.EOF(buff, 0, 0);
        }