/// <summary>
/// Scan HTML source, reporting lexical events.
/// <para/>
/// Characters are examined one at a time with <see cref="TextReader.Peek()"/> and are
/// only consumed once the selected action has run and did not ask for the character to
/// be re-examined. Each character is looked up in the state table for the current
/// state to obtain an action and the next state; the loop ends when the state becomes
/// <c>S_DONE</c>, after which <see cref="IScanHandler.EOF(char[], int, int)"/> is reported.
/// </summary>
/// <param name="r">Reader that provides characters</param>
/// <param name="h">ScanHandler that accepts lexical events.</param>
/// <exception cref="Exception">
/// The state table has no action for the current character in the current state,
/// or it yields an action code this method does not handle.
/// </exception>
public virtual void Scan(TextReader r, IScanHandler h)
{
    theState = S_PCDATA;

    // Remove any leading BOM
    int firstChar = r.Peek();
    if (firstChar == '\uFEFF')
    {
        r.Read();
    }

    while (theState != S_DONE)
    {
        int ch = r.Peek();

        // When true, the character currently at the head of the reader is left in
        // place so that the next iteration sees it again.
        bool unread = false;

        // Remap characters in the 0x80..0x9F range through theWinMap
        if (ch >= 0x80 && ch <= 0x9F)
        {
            ch = theWinMap[ch - 0x80];
        }

        if (ch == '\r')
        {
            // The CR itself is always consumed here and processed as a newline.
            r.Read();
            ch = r.Peek(); // expect LF next
            if (ch != '\n')
            {
                // Lone CR: synthesize the newline and leave the following
                // character in the reader for the next iteration.
                unread = true;
                ch = '\n';
            }
        }

        if (ch == '\n')
        {
            theCurrentLine++;
            theCurrentColumn = 0;
        }
        else
        {
            theCurrentColumn++;
        }

        if (!(ch >= 0x20 || ch == '\n' || ch == '\t' || ch == -1))
        {
            // Unsupported control character: drop it. Peek() does not advance the
            // reader, so the character must be consumed explicitly; otherwise the
            // same character would be peeked again on every iteration and the loop
            // would never terminate.
            r.Read();
            continue;
        }

        // Search state table
        int adjCh = (ch >= -1 && ch < statetableIndexMaxChar) ? ch : -2;
        int statetableRow = statetableIndex[theState][adjCh + 2];
        int action = 0;
        if (statetableRow != -1)
        {
            action = statetable[statetableRow + 2];
            theNextState = statetable[statetableRow + 3];
        }

        switch (action)
        {
            case 0:
                throw new Exception(
                    "HTMLScanner can't cope with " + (int)ch + " in state " + (int)theState);

            case A_ADUP:
                h.Adup(theOutputBuffer, 0, theSize);
                theSize = 0;
                break;

            case A_ADUP_SAVE:
                h.Adup(theOutputBuffer, 0, theSize);
                theSize = 0;
                Save(ch, h);
                break;

            case A_ADUP_STAGC:
                h.Adup(theOutputBuffer, 0, theSize);
                theSize = 0;
                h.STagC(theOutputBuffer, 0, theSize);
                break;

            case A_ANAME:
                h.Aname(theOutputBuffer, 0, theSize);
                theSize = 0;
                break;

            case A_ANAME_ADUP:
                h.Aname(theOutputBuffer, 0, theSize);
                theSize = 0;
                h.Adup(theOutputBuffer, 0, theSize);
                break;

            case A_ANAME_ADUP_STAGC:
                h.Aname(theOutputBuffer, 0, theSize);
                theSize = 0;
                h.Adup(theOutputBuffer, 0, theSize);
                h.STagC(theOutputBuffer, 0, theSize);
                break;

            case A_AVAL:
                h.Aval(theOutputBuffer, 0, theSize);
                theSize = 0;
                break;

            case A_AVAL_STAGC:
                h.Aval(theOutputBuffer, 0, theSize);
                theSize = 0;
                h.STagC(theOutputBuffer, 0, theSize);
                break;

            case A_CDATA:
                Mark();
                // suppress the final "]]" in the buffer
                if (theSize > 1)
                {
                    theSize -= 2;
                }
                h.PCDATA(theOutputBuffer, 0, theSize);
                theSize = 0;
                break;

            case A_ENTITY_START:
                h.PCDATA(theOutputBuffer, 0, theSize);
                theSize = 0;
                Save(ch, h);
                break;

            case A_ENTITY:
                Mark();
                char ch1 = (char)ch;
                // While the character can still extend the reference being collected,
                // keep saving it (switching between the named / decimal / hex states
                // as the prefix dictates).
                if (theState == S_ENT && ch1 == '#')
                {
                    theNextState = S_NCR;
                    Save(ch, h);
                    break;
                }
                else if (theState == S_NCR && (ch1 == 'x' || ch1 == 'X'))
                {
                    theNextState = S_XNCR;
                    Save(ch, h);
                    break;
                }
                else if (theState == S_ENT && char.IsLetterOrDigit(ch1))
                {
                    Save(ch, h);
                    break;
                }
                else if (theState == S_NCR && char.IsDigit(ch1))
                {
                    Save(ch, h);
                    break;
                }
                else if (theState == S_XNCR && (char.IsDigit(ch1) || "abcdefABCDEF".IndexOf(ch1) != -1))
                {
                    Save(ch, h);
                    break;
                }

                // The whole entity reference has been collected
                // (offset 1 skips the first buffered character).
                h.Entity(theOutputBuffer, 1, theSize - 1);
                int ent = h.GetEntity();
                if (ent != 0)
                {
                    theSize = 0;
                    if (ent >= 0x80 && ent <= 0x9F)
                    {
                        ent = theWinMap[ent - 0x80];
                    }
                    if (ent < 0x20)
                    {
                        // Control becomes space
                        ent = 0x20;
                    }
                    else if (ent >= 0xD800 && ent <= 0xDFFF)
                    {
                        // Surrogates get dropped
                        ent = 0;
                    }
                    else if (ent <= 0xFFFF)
                    {
                        // BMP character
                        Save(ent, h);
                    }
                    else
                    {
                        // Astral converted to two surrogates
                        ent -= 0x10000;
                        Save((ent >> 10) + 0xD800, h);
                        Save((ent & 0x3FF) + 0xDC00, h);
                    }
                    if (ch != ';')
                    {
                        // The terminating character is not part of the reference:
                        // leave it to be scanned again.
                        unread = true;
                        theCurrentColumn--;
                    }
                }
                else
                {
                    // The handler reported no entity value (0): keep the buffered
                    // text and re-scan the current character.
                    unread = true;
                    theCurrentColumn--;
                }
                theNextState = S_PCDATA;
                break;

            case A_ETAG:
                h.ETag(theOutputBuffer, 0, theSize);
                theSize = 0;
                break;

            case A_DECL:
                h.Decl(theOutputBuffer, 0, theSize);
                theSize = 0;
                break;

            case A_GI:
                h.GI(theOutputBuffer, 0, theSize);
                theSize = 0;
                break;

            case A_GI_STAGC:
                h.GI(theOutputBuffer, 0, theSize);
                theSize = 0;
                h.STagC(theOutputBuffer, 0, theSize);
                break;

            case A_LT:
                Mark();
                Save('<', h);
                Save(ch, h);
                break;

            case A_LT_PCDATA:
                Mark();
                Save('<', h);
                h.PCDATA(theOutputBuffer, 0, theSize);
                theSize = 0;
                break;

            case A_PCDATA:
                Mark();
                h.PCDATA(theOutputBuffer, 0, theSize);
                theSize = 0;
                break;

            case A_CMNT:
                Mark();
                h.Cmnt(theOutputBuffer, 0, theSize);
                theSize = 0;
                break;

            case A_MINUS3:
                Save('-', h);
                Save(' ', h);
                break;

            case A_MINUS2:
                Save('-', h);
                Save(' ', h);
                // C# has no implicit fall-through, so the A_MINUS statements are
                // repeated here.
                Save('-', h);
                Save(ch, h);
                break;

            case A_MINUS:
                Save('-', h);
                Save(ch, h);
                break;

            case A_PI:
                Mark();
                h.PI(theOutputBuffer, 0, theSize);
                theSize = 0;
                break;

            case A_PITARGET:
                h.PITarget(theOutputBuffer, 0, theSize);
                theSize = 0;
                break;

            case A_PITARGET_PI:
                h.PITarget(theOutputBuffer, 0, theSize);
                theSize = 0;
                h.PI(theOutputBuffer, 0, theSize);
                break;

            case A_SAVE:
                Save(ch, h);
                break;

            case A_SKIP:
                break;

            case A_SP:
                Save(' ', h);
                break;

            case A_STAGC:
                h.STagC(theOutputBuffer, 0, theSize);
                theSize = 0;
                break;

            case A_EMPTYTAG:
                Mark();
                if (theSize > 0)
                {
                    h.GI(theOutputBuffer, 0, theSize);
                }
                theSize = 0;
                h.STagE(theOutputBuffer, 0, theSize);
                break;

            case A_UNGET:
                unread = true;
                theCurrentColumn--;
                break;

            case A_UNSAVE_PCDATA:
                if (theSize > 0)
                {
                    theSize--;
                }
                h.PCDATA(theOutputBuffer, 0, theSize);
                theSize = 0;
                break;

            default:
                throw new Exception("Can't process state " + action);
        }

        // Consume the character that was just processed unless it must be seen again.
        if (!unread)
        {
            r.Read();
        }
        theState = theNextState;
    }

    h.EOF(theOutputBuffer, 0, 0);
}
/// <summary>
/// Reads <paramref name="br"/> line by line and reports one or more events to
/// <paramref name="h"/> per line, chosen by the line's first character:
/// <c>(</c> reports <c>GI</c>, <c>)</c> reports <c>ETag</c>, <c>?</c> reports <c>PI</c>,
/// <c>A</c> reports <c>Aname</c>/<c>Aval</c>, <c>-</c> reports <c>PCDATA</c> and
/// <c>E</c> reports <c>Entity</c>. The remainder of the line (after the first
/// character) is passed as the event's text. Blank lines and lines starting with
/// any other character are ignored. <c>EOF</c> is reported once the reader is exhausted.
/// </summary>
/// <param name="br">Reader that provides the lines to scan.</param>
/// <param name="h">Handler that receives the events.</param>
public virtual void Scan(TextReader br, IScanHandler h)
{
    string s;
    // Characters of the most recently read line; stays null if the input has no lines.
    char[] buff = null;
    // True after a GI has been reported and before the matching STagC has been sent.
    bool instag = false;

    while ((s = br.ReadLine()) != null)
    {
        int size = s.Length;
        buff = s.ToCharArray();

        if (size == 0)
        {
            // ReadLine() yields "" for a blank line; there is no first character to
            // dispatch on, so treat it like any other unrecognized line.
            continue;
        }

        switch (buff[0])
        {
            case '(':
                if (instag)
                {
                    h.STagC(buff, 0, 0);
                }
                h.GI(buff, 1, size - 1);
                instag = true;
                break;

            case ')':
                if (instag)
                {
                    h.STagC(buff, 0, 0);
                    instag = false;
                }
                h.ETag(buff, 1, size - 1);
                break;

            case '?':
                if (instag)
                {
                    h.STagC(buff, 0, 0);
                    instag = false;
                }
                h.PI(buff, 1, size - 1);
                break;

            case 'A':
                {
                    // The name runs up to the first space; the value is everything after it.
                    int sp = s.IndexOf(' ');
                    if (sp < 0)
                    {
                        // No separator on the line: report the whole remainder as the
                        // name with an empty value instead of passing a negative length.
                        h.Aname(buff, 1, size - 1);
                        h.Aval(buff, 0, 0);
                    }
                    else
                    {
                        h.Aname(buff, 1, sp - 1);
                        h.Aval(buff, sp + 1, size - sp - 1);
                    }
                    break;
                }

            case '-':
                if (instag)
                {
                    h.STagC(buff, 0, 0);
                    instag = false;
                }
                if (s.Equals("-\\n", StringComparison.Ordinal))
                {
                    // The three-character line "-\n" (dash, backslash, 'n') stands
                    // for a single newline character.
                    buff[0] = '\n';
                    h.PCDATA(buff, 0, 1);
                }
                else
                {
                    // FIXME:
                    // Does not decode \t and \\ in input
                    h.PCDATA(buff, 1, size - 1);
                }
                break;

            case 'E':
                if (instag)
                {
                    h.STagC(buff, 0, 0);
                    instag = false;
                }
                h.Entity(buff, 1, size - 1);
                break;

            default:
                // Unrecognized line type: ignored.
                break;
        }
    }

    h.EOF(buff, 0, 0);
}