/// <summary>
/// Print the contents of the tag.
/// The result has the form <c>Tag (start,end): text</c>, or
/// <c>End (start,end): text</c> for an end tag. When the whole string would
/// exceed 80 characters the text is cut short and "..." is appended.
/// </summary>
/// <returns>
/// A string describing the tag. For text that looks like HTML use #toHtml().
/// </returns>
public override System.String ToString()
{
    System.String text = GetText();
    System.Text.StringBuilder ret = new System.Text.StringBuilder(20 + text.Length);
    System.String type = IsEndTag() ? "End" : "Tag";
    Cursor start = new Cursor(Page, StartPosition);
    Cursor end = new Cursor(Page, EndPosition);

    ret.Append(type);
    ret.Append(" (");
    ret.Append(start);
    ret.Append(",");
    ret.Append(end);
    ret.Append("): ");

    if (80 < ret.Length + text.Length)
    {
        // Leave room for the three-character ellipsis. The header by itself
        // may already be longer than 77 characters, in which case no text is
        // kept at all; clamping avoids passing a negative count.
        int keep = System.Math.Max(0, 77 - ret.Length);
        ret.Append(text, 0, keep);
        ret.Append("...");
    }
    else
    {
        ret.Append(text);
    }

    return (ret.ToString());
}
/// <summary>
/// Rewind the lexer so that parsing starts over at the top of the page.
/// The page is reset first and the cursor is then replaced by a fresh one at
/// offset zero, so the next call to <code>nextNode()</code> returns the
/// first lexeme on the page.
/// </summary>
public virtual void Reset()
{
    this.Page.Reset();
    this.Cursor = new Cursor(this.Page, 0);
}
/// <summary>
/// Fetch the object at the given index.
/// </summary>
/// <param name="index">The item number to get.</param>
/// <param name="reuse">
/// Optional scratch object obtained from an earlier fetch that the caller no
/// longer needs. When it is not null it is overwritten with the requested
/// element and returned, which avoids allocating a new object. It must be a
/// <c>Cursor</c>, because it is cast to one.
/// </param>
/// <returns>The Ordered object at that index.</returns>
public virtual IOrdered Fetch(int index, IOrdered reuse)
{
    if (null == reuse)
    {
        // Nothing to recycle: build a new cursor for the stored offset.
        return (new Cursor(Page, mIndices[index]));
    }

    Cursor recycled = (Cursor) reuse;
    recycled.mPosition = mIndices[index];
    recycled.mPage = Page; // redundant
    return (recycled);
}
/// <summary>
/// Add an element to the list, keeping the offsets sorted and unique.
/// </summary>
/// <param name="cursor">The element to add.</param>
/// <returns>
/// The position at which the element was inserted, or the index of the
/// existing element if it is a duplicate.
/// </returns>
public virtual int Add(Cursor cursor)
{
    int offset = cursor.Position;

    // Empty list: the new offset becomes the first entry.
    if (0 == mCount)
    {
        InsertElementAt(offset, 0);
        return (0);
    }

    // Fast paths: compare against the current last entry.
    int tail = mIndices[mCount - 1];
    if (offset == tail)
    {
        return (mCount - 1);
    }
    if (offset > tail)
    {
        int appendedAt = mCount;
        InsertElementAt(offset, appendedAt);
        return (appendedAt);
    }

    // General case: find where it goes, then insert, but not twice.
    int slot = SortImpl.Bsearch(this, cursor);
    bool alreadyPresent = (slot < Size()) && (offset == mIndices[slot]);
    if (!alreadyPresent)
    {
        InsertElementAt(offset, slot);
    }
    return (slot);
}
/// <summary>
/// Get the line number for a cursor.
/// </summary>
/// <param name="cursor">The character offset into the page.</param>
/// <returns>The line number the character is in.</returns>
public virtual int Row(Cursor cursor)
{
    int slot = SortImpl.Bsearch(this, cursor);

    // The search returns the index itself when the offset matches one of the
    // stored line end positions exactly. Such a position is actually the
    // beginning of the next line, so one more line is counted in that case.
    bool onLineTransition = (slot < mCount) && (cursor.Position == mIndices[slot]);
    return (onLineTransition ? slot + 1 : slot);
}
/// <summary>
/// Get the column number for a cursor.
/// </summary>
/// <param name="cursor">The character offset into the page.</param>
/// <returns>
/// The character offset into the line this cursor is on, i.e. the cursor
/// position minus the stored offset for the preceding row (zero for row 0).
/// </returns>
public virtual int Column(Cursor cursor)
{
    int line = Row(cursor);
    int lineStart = (0 == line) ? 0 : this.ElementAt(line - 1);
    return (cursor.Position - lineStart);
}
/// <summary>
/// Express this string node as a printable string.
/// This is suitable for display in a debugger or output to a printout.
/// Tab, line feed and carriage return are replaced by their escape
/// sequences, and the output is cut off with "..." once it reaches 77
/// characters (roughly 80 characters in total).
/// </summary>
/// <returns>A string representation of the string node.</returns>
public override System.String ToString()
{
    int startpos = StartPosition;
    int endpos = EndPosition;

    // Clamp the size hint so an inverted range cannot produce a negative capacity.
    System.Text.StringBuilder ret =
        new System.Text.StringBuilder(System.Math.Max(0, endpos - startpos) + 20);

    if (null == mText)
    {
        // No cached text: read the characters back from the page.
        Cursor start = new Cursor(Page, startpos);
        Cursor end = new Cursor(Page, endpos);
        ret.Append("Txt (");
        ret.Append(start);
        ret.Append(",");
        ret.Append(end);
        ret.Append("): ");
        while (start.Position < endpos)
        {
            try
            {
                AppendEscaped(ret, mPage.GetCharacter(start));
            }
            catch (ParserException)
            {
                // Not really expected, and this is only ToString, so the error
                // is not reported. The loop must stop though: a failed read may
                // leave the cursor where it was, and retrying would never end.
                break;
            }
            if (77 <= ret.Length)
            {
                ret.Append("...");
                break;
            }
        }
    }
    else
    {
        ret.Append("Txt (");
        ret.Append(startpos);
        ret.Append(",");
        ret.Append(endpos);
        ret.Append("): ");
        for (int i = 0; i < mText.Length; i++)
        {
            AppendEscaped(ret, mText[i]);
            if (77 <= ret.Length)
            {
                ret.Append("...");
                break;
            }
        }
    }

    return (ret.ToString());
}

/// <summary>
/// Append one character to the buffer, writing tab, line feed and carriage
/// return as the two-character escapes \t, \n and \r.
/// </summary>
/// <param name="buffer">The buffer to append to.</param>
/// <param name="c">The character to append.</param>
private static void AppendEscaped(System.Text.StringBuilder buffer, char c)
{
    switch (c)
    {
        case '\t':
            buffer.Append("\\t");
            break;
        case '\n':
            buffer.Append("\\n");
            break;
        case '\r':
            buffer.Append("\\r");
            break;
        default:
            buffer.Append(c);
            break;
    }
}
/// <summary>
/// Decode script encoded by the Microsoft obfuscator.
/// The input is consumed by a state machine that expects, in order: the
/// leader marker, a six character base64 length, the prefix marker, the
/// encoded payload of that length, a six character base64 checksum and the
/// trailer marker. Characters seen before a leader are copied through
/// unchanged.
/// </summary>
/// <param name="page">The source for encoded text.</param>
/// <param name="cursor">
/// The position at which to start decoding.
/// This is advanced to the end of the encoded text.
/// </param>
/// <returns>The plaintext.</returns>
/// <exception cref="ParserException">
/// If an error is discovered while decoding: an unexpected character, a
/// negative length, a checksum mismatch, or end of input in the middle of an
/// encoded section.
/// </exception>
public static System.String Decode(Page page, Cursor cursor)
{
    int state;
    int substate_initial;
    int substate_length;
    int substate_prefix;
    int substate_checksum;
    int substate_final;
    long checksum;
    long length;
    char[] buffer;
    int index;
    char character;
    int input_character;
    bool found;
    System.Text.StringBuilder ret;

    // Scratch space for the six character length and checksum fields.
    buffer = new char[6];
    ret = new System.Text.StringBuilder(1024);

    state = STATE_INITIAL;
    substate_initial = 0;
    substate_length = 0;
    substate_prefix = 0;
    substate_checksum = 0;
    substate_final = 0;
    length = 0L;
    checksum = 0L;
    index = 0;

    while (STATE_DONE != state)
    {
        input_character = page.GetCharacter(cursor);
        character = (char) input_character;
        if (Page.EOF == input_character)
        {
            // End of input is only legal between encoded sections.
            if ((STATE_INITIAL != state)
                || (0 != substate_initial)
                || (0 != substate_length)
                || (0 != substate_prefix)
                || (0 != substate_checksum)
                || (0 != substate_final))
                throw new ParserException("illegal state for exit");
            state = STATE_DONE;
        }
        else
        {
            switch (state)
            {
                case STATE_INITIAL:
                    if (character == mLeader[substate_initial])
                    {
                        substate_initial++;
                        if (substate_initial == mLeader.Length)
                        {
                            substate_initial = 0;
                            state = STATE_LENGTH;
                        }
                    }
                    else
                    {
                        // Oops, not a leader after all: flush the part of the
                        // leader matched so far, in order, then the current
                        // character. The index advances exactly once per
                        // flushed character.
                        for (int k = 0; k < substate_initial; k++)
                            ret.Append(mLeader[k]);
                        substate_initial = 0;
                        ret.Append(character);
                    }
                    break;

                case STATE_LENGTH:
                    buffer[substate_length] = character;
                    substate_length++;
                    if (substate_length >= buffer.Length)
                    {
                        length = DecodeBase64(buffer);
                        if (0 > length)
                            throw new ParserException("illegal length: " + length);
                        substate_length = 0;
                        state = STATE_PREFIX;
                    }
                    break;

                case STATE_PREFIX:
                    if (character == mPrefix[substate_prefix])
                        substate_prefix++;
                    else
                        throw new ParserException("illegal character encountered: " + (int) character + " ('" + character + "')");
                    if (substate_prefix >= mPrefix.Length)
                    {
                        substate_prefix = 0;
                        state = STATE_DECODE;
                    }
                    break;

                case STATE_DECODE:
                    if ('@' == character)
                        state = STATE_ESCAPE;
                    else
                    {
                        if (input_character < 0x80)
                        {
                            // Map the character onto a lookup table column:
                            // tab becomes 0, printable characters start at 1.
                            if (input_character == '\t')
                                input_character = 0;
                            else if (input_character >= ' ')
                                input_character -= (' ' - 1);
                            else
                                throw new ParserException("illegal encoded character: " + input_character + " ('" + character + "')");
                            char ch = mLookupTable[mEncodingIndex[index % 64]][input_character];
                            ret.Append(ch);
                            checksum += ch;
                            index++;
                        }
                        else
                            ret.Append(character);
                    }
                    length--;
                    if (0 == length)
                    {
                        index = 0;
                        state = STATE_CHECKSUM;
                    }
                    break;

                case STATE_ESCAPE:
                    found = false;
                    for (int i = 0; i < mEscapes.Length; i++)
                        if (character == mEscapes[i])
                        {
                            found = true;
                            character = mEscaped[i];
                            // Stop at the first match so the replacement is
                            // never compared against the remaining escapes.
                            break;
                        }
                    if (!found)
                        throw new ParserException("unexpected escape character: " + (int) character + " ('" + character + "')");
                    ret.Append(character);
                    checksum += character;
                    index++;
                    state = STATE_DECODE;
                    length--;
                    if (0 == length)
                    {
                        index = 0;
                        state = STATE_CHECKSUM;
                    }
                    break;

                case STATE_CHECKSUM:
                    buffer[substate_checksum] = character;
                    substate_checksum++;
                    if (substate_checksum >= buffer.Length)
                    {
                        long check = DecodeBase64(buffer);
                        if (check != checksum)
                            throw new ParserException("incorrect checksum, expected " + check + ", calculated " + checksum);
                        checksum = 0;
                        substate_checksum = 0;
                        state = STATE_FINAL;
                    }
                    break;

                case STATE_FINAL:
                    if (character == mTrailer[substate_final])
                        substate_final++;
                    else
                        throw new ParserException("illegal character encountered: " + (int) character + " ('" + character + "')");
                    if (substate_final >= mTrailer.Length)
                    {
                        substate_final = 0;
                        state = LAST_STATE;
                    }
                    break;

                default:
                    throw new ParserException("invalid state: " + state);
            }
        }
    }

    return (ret.ToString());
}
/// <summary>
/// Get the column number for a cursor.
/// The work is delegated to the page's line index.
/// </summary>
/// <param name="cursor">The character offset into the page.</param>
/// <returns>The character offset into the line this cursor is on.</returns>
public virtual int Column(Cursor cursor)
{
    int column = mIndex.Column(cursor);
    return (column);
}
/// <summary>
/// Get the text line the position of the cursor lies on.
/// </summary>
/// <remarks>
/// The line boundaries follow the same convention as the index's
/// <c>Column</c> computation: the line for row <c>r</c> starts at the index
/// entry <c>r - 1</c> (or at offset 0 for the first row) and ends at index
/// entry <c>r</c>. For the last, still open line it ends at the current
/// source offset.
/// </remarks>
/// <param name="cursor">The position to calculate for.</param>
/// <returns>
/// The contents of the URL or file corresponding to the line number
/// containing the cursor position.
/// </returns>
public virtual System.String GetLine(Cursor cursor)
{
    // Make sure the page content has been loaded before querying the source.
    if (mSource == null)
    {
        this.GetPageContent(this.mConnection, false);
    }

    int line = Row(cursor);
    int size = mIndex.Size();

    // The first row has no preceding index entry and starts at offset zero;
    // this also covers an index that is still empty.
    int start = (0 < line) ? mIndex.ElementAt(line - 1) : 0;

    // Only rows that already have a recorded end use the index; the current
    // (last) line extends to whatever has been read so far.
    int end = (line < size) ? mIndex.ElementAt(line) : mSource.Offset();

    return (GetText(start, end));
}
/// <summary>
/// Get the line number for a cursor.
/// The work is delegated to the page's line index.
/// </summary>
/// <param name="cursor">The character offset into the page.</param>
/// <returns>The line number the character is in.</returns>
public virtual int Row(Cursor cursor)
{
    int row = mIndex.Row(cursor);
    return (row);
}
/// <summary>
/// Read the character at the given cursor position.
/// The cursor position can only be behind or equal to the current source
/// position. A position equal to the source offset reads a new character
/// from the source; a position behind it re-reads a character the source
/// has already delivered.
/// End of lines (EOL) are returned as \n, by converting \r and \r\n to \n,
/// and the end-of-line index is updated accordingly.
/// Advances the cursor position by one (or two in the \r\n case).
/// </summary>
/// <param name="cursor">The position to read at.</param>
/// <returns>
/// The character at that position; the cursor is modified to prepare for
/// the next read. If the source is exhausted, <c>EOF</c> is returned and the
/// cursor is not advanced.
/// </returns>
/// <exception cref="ParserException">
/// If an IOException on the underlying source occurs, or an attempt is made
/// to read characters in the future (the cursor position is ahead of the
/// underlying stream).
/// </exception>
public virtual char GetCharacter(Cursor cursor)
{
    int i;
    char ret;

    i = cursor.Position;
    if (mSource.Offset() < i)
        // hmmm, we could skip ahead, but then what about the EOL index
        throw new ParserException("Attempt to read future characters from source " + i + " > " + mSource.Offset());
    else if (mSource.Offset() == i)
        // the cursor is at the read head: pull a new character from the source
        try
        {
            i = mSource.Read();
            if (Source.EOF == i)
                ret = EOF;
            else
            {
                ret = (char) i;
                cursor.Advance();
            }
        }
        catch (System.IO.IOException ioe)
        {
            throw new ParserException("Problem reading a character at position " + cursor.Position, ioe);
        }
    else
    {
        // historic read: the character was already delivered by the source
        try
        {
            ret = mSource.GetCharacter(i);
        }
        catch (System.IO.IOException ioe)
        {
            throw new ParserException("Can't read a character at position " + i, ioe);
        }
        cursor.Advance();
    }

    // handle \r
    if ('\r' == ret)
    {
        // switch to single character EOL
        ret = '\n';

        // check for a \n in the next position
        if (mSource.Offset() == cursor.Position)
            // the next character has not been read yet: read it, and push it
            // back if it turns out not to be the \n of a \r\n pair
            try
            {
                i = mSource.Read();
                if (Source.EOF == i)
                {
                    // do nothing
                }
                else if ('\n' == (char) i)
                    cursor.Advance();
                else
                    try
                    {
                        mSource.Unread();
                    }
                    catch (System.IO.IOException ioe)
                    {
                        throw new ParserException("Can't unread a character at position " + cursor.Position, ioe);
                    }
            }
            catch (System.IO.IOException ioe)
            {
                throw new ParserException("Problem reading a character at position " + cursor.Position, ioe);
            }
        else
            // the next character is already available: just peek at it
            try
            {
                if ('\n' == mSource.GetCharacter(cursor.Position))
                    cursor.Advance();
            }
            catch (System.IO.IOException ioe)
            {
                throw new ParserException("can't read a character at position " + cursor.Position, ioe);
            }
    }
    if ('\n' == ret)
        // update the EOL index in any case
        mIndex.Add(cursor);

    return (ret);
}
/// <summary>
/// Deserialize the page.
/// For details see <code>writeObject()</code>.
/// The first stored value is a flag selecting one of two layouts. When it is
/// set, a character offset and a URL string are read and the page is then
/// re-read up to that offset; otherwise only a URL string is read. In both
/// cases the remaining state is restored through
/// <c>Support.SupportMisc.DefaultReadObject</c> and <c>Url</c> is assigned
/// last.
/// </summary>
/// <param name="in_Renamed">The serialization data to decode.</param>
/// <param name="context">The streaming context, passed on to
/// <c>DefaultReadObject</c>.</param>
/// <exception cref="System.IO.IOException">If a <c>ParserException</c>
/// occurs while the page is being re-read up to the stored offset.
/// </exception>
protected Page(System.Runtime.Serialization.SerializationInfo in_Renamed, System.Runtime.Serialization.StreamingContext context)
{
    bool fromurl;
    int offset;
    System.String href;
    System.Uri url;
    Cursor cursor;

    fromurl = in_Renamed.GetBoolean("Winista.Text.Htmlparser.Lex.Pagedata1");
    if (fromurl)
    {
        offset = in_Renamed.GetInt32("Winista.Text.Htmlparser.Lex.Pagedata2");
        href = ((System.String) in_Renamed.GetValue("Winista.Text.Htmlparser.Lex.Pagedata3", typeof(System.String)));
        Support.SupportMisc.DefaultReadObject(in_Renamed, context, this);
        // open the URL
        if (null != Url)
        {
            url = new System.Uri(Url);
            // NOTE(review): the statement that opened the connection is
            // commented out, so this try block is empty, its catch cannot
            // trigger, and 'url' is otherwise unused. Confirm whether the
            // connection should be re-established here.
            try
            {
                //Connection = (System.Net.HttpWebRequest) System.Net.WebRequest.Create(url);
            }
            catch (ParserException pe)
            {
                throw new System.IO.IOException(pe.Message);
            }
        }
        // read 'offset' characters from the start of the page, one at a time
        cursor = new Cursor(this, 0);
        for (int i = 0; i < offset; i++)
            try
            {
                GetCharacter(cursor);
            }
            catch (ParserException pe)
            {
                throw new System.IO.IOException(pe.Message);
            }
        Url = href;
    }
    else
    {
        href = ((System.String) in_Renamed.GetValue("Winista.Text.Htmlparser.Lex.Pagedata4", typeof(System.String)));
        Support.SupportMisc.DefaultReadObject(in_Renamed, context, this);
        Url = href;
    }
}
/// <summary>
/// Advance the cursor through a JIS escape sequence.
/// Characters are consumed until the three character sequence
/// ESC, '(' and 'J' has been read, or the page reports end of input.
/// </summary>
/// <param name="cursor">A cursor positioned within the escape sequence.</param>
/// <exception cref="ParserException">
/// If a problem occurs reading from the source.
/// </exception>
protected internal virtual void ScanJIS(Cursor cursor)
{
    // Number of characters of the terminating sequence matched so far.
    int matched = 0;

    while (true)
    {
        char current = mPage.GetCharacter(cursor);
        if (Page.EOF == current)
        {
            return;
        }

        if (0 == matched)
        {
            if (0x1b == current) // escape
            {
                matched = 1;
            }
        }
        else if (1 == matched)
        {
            matched = ('(' == current) ? 2 : 0;
        }
        else if (2 == matched)
        {
            if ('J' == current)
            {
                return;
            }
            matched = 0;
        }
        else
        {
            throw new System.SystemException("state " + matched);
        }
    }
}
/// <summary>
/// Remove an element from the list.
/// Nothing happens when no entry with the cursor's position is stored.
/// </summary>
/// <param name="cursor">The element to remove.</param>
public virtual void Remove(Cursor cursor)
{
    // Locate the slot the cursor occupies, or would occupy.
    int slot = SortImpl.Bsearch(this, cursor);

    if (slot >= Size())
    {
        return;
    }
    if (cursor.Position != mIndices[slot])
    {
        return;
    }

    RemoveElementAt(slot);
}
/// <summary>
/// Creates a new instance of a Lexer.
/// The cursor starts at offset zero of the page and the lexer acts as its
/// own node factory.
/// </summary>
/// <param name="page">The page with HTML text.</param>
public Lexer(Page page)
{
    this.Page = page;
    this.Cursor = new Cursor(page, 0);
    this.NodeFactory = this;
}
/// <summary>
/// Creates an iterator over the given lexer.
/// The iterator's own cursor starts at offset zero of the lexer's page.
/// </summary>
/// <param name="lexer">The lexer whose page is iterated.</param>
/// <param name="fb">The feedback object stored by the iterator.</param>
public IteratorImpl(Lexer lexer, IParserFeedBack fb)
{
    this.mLexer = lexer;
    this.mFeedback = fb;
    this.mCursor = new Cursor(this.mLexer.Page, 0);
}