Exemplo n.º 1
0
        /// <summary>
        /// Decodes the token stream stored in the data of <paramref name="page"/> and forwards
        /// every decoded element (words, blanks, line breaks, formatting switches, images and
        /// links) to <paramref name="markup"/>.
        /// </summary>
        /// <remarks>
        /// Each atom starts with a one-byte token that may be followed by operands. Operands of
        /// tokens that are not forwarded to the writer are still read and discarded so that the
        /// reader stays aligned with the stream. Decoding stops at the end-of-page token, at the
        /// end of the data, at the first unknown token, or once the atom limit is reached.
        /// A warning is logged when the page declares a non-zero atom count that differs from
        /// the number of atoms counted here.
        /// </remarks>
        /// <param name="page">Page to decode; its page number, data and atom count are read.</param>
        /// <param name="markup">Writer that receives the decoded content and formatting state.</param>
        /// <exception cref="EndOfStreamException">
        /// May be thrown by the underlying reader when the data ends in the middle of an atom.
        /// </exception>
        public static void Parse(Page page, IMarkupWriter markup)
        {
            // Upper bound on the number of loop iterations (atoms) processed for one page.
            const int maxAtoms = 20000;

            markup.Pagenumber = page.Pagenumber;

            using (BinaryReader reader = new BinaryReader(new MemoryStream(page.Data)))
            {
                bool hardCarriageReturn = false;

                bool hyphenAtEol             = false;
                bool hyphenAtEolSeparatingCK = false; // ck -> k-k
                bool hyphenInvisible         = false;

                // True while any kind of hyphen token is pending for the next word.
                bool HyphenPending() => hyphenAtEol || hyphenAtEolSeparatingCK || hyphenInvisible;

                bool incompleteWord = false;

                byte font = 0;

                int atomCounter;
                int wordCounter;

                // An empty data buffer contains no atoms. Starting in the finished state avoids
                // the first ReadByte(), which would otherwise throw EndOfStreamException.
                bool endOfPage = reader.BaseStream.Length == 0;
                for (atomCounter = 0, wordCounter = 0; atomCounter < maxAtoms && !endOfPage; atomCounter++)
                {
                    byte token = reader.ReadByte();
                    switch (token)
                    {
                    case 0:                      // Blanks
                    {
                        reader.ReadByte();       // number of blanks
                        if (!hardCarriageReturn) // no blank after line break
                        {
                            markup.AddBlank();
                        }

                        hardCarriageReturn = false;
                        break;
                    }

                    case 1:     // Word
                    {
                        int    length = reader.ReadByte();
                        string word   = ReadWord(reader, length, GetEncoding(length, font));
                        if (incompleteWord)     // at beginning of page
                        {
                            // The word is consumed but neither emitted nor counted.
                            incompleteWord = false;
                        }
                        else
                        {
                            int delimiterCount = DelimiterCount(word, HyphenPending());
                            if (word.Length > 0)
                            {
                                markup.AddWord(word, wordCounter, delimiterCount);
                            }

                            wordCounter += delimiterCount;
                        }

                        // Hyphen tokens only apply to the word that follows them.
                        hyphenAtEol = hyphenAtEolSeparatingCK = hyphenInvisible = false;
                        if (length > sbyte.MaxValue)     // blank at the end
                        {
                            markup.AddBlank();
                            atomCounter++;               // the trailing blank counts as an atom of its own
                        }

                        break;
                    }

                    case 2:     // Hard carriage return
                    {
                        hardCarriageReturn = true;
                        markup.AddLineBreak();
                        break;
                    }

                    case 3:     // End of page
                    {
                        endOfPage = true;
                        break;
                    }

                    case 4:     // Italic on
                    {
                        markup.Italic = true;
                        break;
                    }

                    case 5:     // Italic off
                    {
                        markup.Italic = false;
                        break;
                    }

                    case 6:     // Bold on
                    {
                        markup.Bold = true;
                        break;
                    }

                    case 7:     // Bold off
                    {
                        markup.Bold = false;
                        break;
                    }

                    case 8:     // Font preset (size and style)
                    {
                        byte preset = reader.ReadByte();
                        switch (preset)
                        {
                        case 0:
                            markup.FontSize = 1.0f;
                            markup.Bold     = false;
                            markup.Italic   = false;
                            break;

                        case 1:
                            markup.FontSize = 1.34f;
                            break;

                        case 2:
                            markup.FontSize = 1.22f;
                            break;

                        case 3:
                            markup.FontSize = 1.1f;
                            break;

                        case 4:
                            markup.FontSize = 1.0f;
                            markup.Bold     = true;
                            break;

                        case 5:
                            markup.FontSize = 1.0f;
                            break;

                        case 6:
                            markup.FontSize = 1.0f;
                            markup.Italic   = true;
                            break;

                        default:
                            Log.Warn(string.Format("Font preset {0:D} is unknown.", preset));
                            break;
                        }

                        break;
                    }

                    case 9:     // Ly
                    {
                        break;
                    }

                    case 10:     // Image
                    {
                        reader.ReadInt32();     // width (not used)
                        string name = ReadName(reader);
                        markup.AddBlockImage(name.Replace("#", ""));
                        break;
                    }

                    case 11:     // Image link
                    {
                        string name = ReadName(reader);
                        markup.BeginImageLink(name);
                        break;
                    }

                    case 12:     // End link
                    {
                        markup.EndLink();
                        break;
                    }

                    case 13:     // Font
                    {
                        font = reader.ReadByte();
                        break;
                    }

                    case 14:     // Filename
                    {
                        ReadName(reader);     // value is not used
                        break;
                    }

                    case 15:     // Concordance
                    case 16:     // Node number
                    {
                        reader.ReadUInt16();     // value is not used
                        break;
                    }

                    case 17:     // Superscript on
                    {
                        markup.Superscript = true;
                        break;
                    }

                    case 18:     // Superscript off
                    {
                        markup.Superscript = false;
                        break;
                    }

                    case 19:     // Sigil
                    {
                        ReadName(reader);     // value is not used
                        break;
                    }

                    case 20:     // Header (not generated anymore)
                    {
                        break;
                    }

                    case 21:     // Hyphen at end of line
                    {
                        hyphenAtEol = true;
                        break;
                    }

                    case 22:     // Underlined on
                    {
                        markup.Underline = true;
                        break;
                    }

                    case 23:     // Underlined off
                    {
                        markup.Underline = false;
                        break;
                    }

                    case 24:     // Greek on
                    case 25:     // Greek off
                    {
                        break;
                    }

                    case 27:     // One blank
                    {
                        markup.AddBlank();
                        break;
                    }

                    case 28:     // Vertical line on
                    {
                        markup.VerticalLine = true;
                        break;
                    }

                    case 29:     // Vertical line off
                    {
                        markup.VerticalLine = false;
                        break;
                    }

                    case 30:     // TD
                    case 31:     // Null
                    {
                        break;
                    }

                    case 128:     // Page link (replaces image link)
                    {
                        int    pagenumber = reader.ReadInt32();
                        string imageName  = ReadName(reader);
                        if (pagenumber != 0)
                        {
                            markup.BeginPageLink(pagenumber);
                        }
                        else
                        {
                            // A page number of zero falls back to an image link.
                            markup.BeginImageLink(imageName);
                        }

                        break;
                    }

                    case 129:     // ID
                    case 130:     // End ID
                    {
                        reader.ReadByte();     // value is not used
                        break;
                    }

                    case 131:     // Subscript on
                    {
                        markup.Subscript = true;
                        break;
                    }

                    case 132:     // Subscript off
                    {
                        markup.Subscript = false;
                        break;
                    }

                    case 133:     // Color
                    {
                        markup.Color = reader.ReadBoolean();
                        break;
                    }

                    case 134:     // Image inline
                    {
                        reader.ReadUInt16();     // width (not used)
                        reader.ReadUInt16();     // height (not used)
                        string name = ReadName(reader);
                        markup.AddInlineImage(name);
                        break;
                    }

                    case 135:     // Searchword
                    {
                        ReadName(reader);     // value is not used
                        break;
                    }

                    case 136:     // Font size
                    {
                        byte fontSize = reader.ReadByte();
                        markup.FontSize = fontSize / 100.0f;
                        break;
                    }

                    case 137:     // Copyright
                    {
                        reader.ReadByte();
                        break;
                    }

                    case 138:     // Auto link
                    {
                        int autoLink = reader.ReadInt32();
                        markup.BeginPageLink(autoLink);
                        break;
                    }

                    case 139:     // Soft carriage return
                    {
                        // No blank is emitted while a hyphen token is pending.
                        if (!HyphenPending())
                        {
                            markup.AddBlank();
                        }

                        break;
                    }

                    case 140:     // Hyphen invisible (e.g. in 1984 between 19 and 84)
                    {
                        hyphenInvisible = true;
                        break;
                    }

                    case 141:     // Letter spacing on
                    {
                        markup.LetterSpacing = true;
                        break;
                    }

                    case 142:     // Letter spacing off
                    {
                        markup.LetterSpacing = false;
                        break;
                    }

                    case 143:     // Half line spacing
                    {
                        markup.AddHalfLineSpace();
                        break;
                    }

                    case 144:     // List item
                    case 145:     // End list item
                    case 146:     // Unordered list
                    case 147:     // End unordered list
                    {
                        break;
                    }

                    case 148:     // Set X (offset left border pixel, is reset after SoftCarriageReturn)
                    {
                        reader.ReadUInt16();     // value is not used
                        break;
                    }

                    case 149:               // SV (some sort of link)
                    {
                        reader.ReadInt64(); // jump 8 bytes
                        break;
                    }

                    case 150:     // SV lemma
                    {
                        ReadName(reader);     // value is not used
                        break;
                    }

                    case 151:     // No SVFF (stops SV lemma)
                    {
                        break;
                    }

                    case 152:     // Centered on (alignment)
                    {
                        markup.Centered = true;
                        break;
                    }

                    case 153:     // Centered off
                    {
                        markup.Centered = false;
                        break;
                    }

                    case 154:     // Align right on (precedes centered)
                    {
                        markup.Right = true;
                        break;
                    }

                    case 155:     // Align right off
                    {
                        markup.Right = false;
                        break;
                    }

                    case 156:     // E (not used anymore)
                    {
                        reader.ReadUInt16();
                        break;
                    }

                    case 157:     // End E
                    {
                        break;
                    }

                    case 158:     // Biblio page number
                    {
                        reader.ReadInt32();
                        break;
                    }

                    case 159:     // Not first line
                    case 160:     // Thumb
                    {
                        break;
                    }

                    case 161:     // End new
                    {
                        reader.ReadBytes(3);
                        break;
                    }

                    case 162:     // URL
                    {
                        string url = ReadName(reader);
                        if (url.Length > 0)
                        {
                            markup.BeginUrl(url);
                        }

                        break;
                    }

                    case 163:     // End URL
                    {
                        markup.EndUrl();
                        break;
                    }

                    case 164:     // Word anchor
                    case 165:     // Thumb www
                    case 166:     // S
                    case 167:     // No justification on (alignment)
                    case 168:     // No justification off
                    case 169:     // Next blank is fixed
                    {
                        break;
                    }

                    case 170:     // Word rest (which appears on next page)
                    {
                        int    length         = reader.ReadByte();
                        string word           = ReadWord(reader, length, Encoding.GetEncoding(1252));
                        int    delimiterCount = DelimiterCount(word, false);
                        markup.AddWord(word, wordCounter, delimiterCount);
                        wordCounter += delimiterCount;
                        if (length > sbyte.MaxValue)     // blank at the end
                        {
                            markup.AddBlank();
                            atomCounter++;               // the trailing blank counts as an atom of its own
                        }

                        break;
                    }

                    case 171:     // Incomplete word (at beginning of page)
                    {
                        ReadName(reader);
                        incompleteWord = true;     // the next word token is consumed without being emitted
                        break;
                    }

                    case 172:     // Hyphen CK (e.g. "entwickelte" is separated as "entwik-kelte")
                    {
                        hyphenAtEolSeparatingCK = true;
                        break;
                    }

                    case 173:     // Hebrew on
                    case 174:     // Hebrew off
                    {
                        break;
                    }

                    case 175:     // NodeNumber2
                    {
                        reader.ReadInt32();     // value is not used
                        break;
                    }

                    case 176:     // Strikethrough on
                    {
                        markup.Strikethrough = true;
                        break;
                    }

                    case 177:     // Strikethrough off
                    {
                        markup.Strikethrough = false;
                        break;
                    }

                    case 178:                // Set Y
                    {
                        reader.ReadUInt16(); // jump 2
                        break;
                    }

                    case 179:     // Cor (swallows next element)
                    {
                        reader.ReadUInt32();
                        break;
                    }

                    case 180:     // End cor
                    {
                        break;
                    }

                    case 236:     // Thin dashed line (distance 28)
                    {
                        break;
                    }

                    default:
                    {
                        // The operand size of an unknown token is unknown, so decoding cannot continue.
                        // NOTE(review): the Page object itself is passed as {1}; confirm that its
                        // ToString() yields something useful (e.g. the page number).
                        Log.Warn("No matching tag found for token {0} on page {1}.", token, page);
                        endOfPage = true;
                        break;
                    }
                    }

                    // Running out of data also ends the page, even without an end-of-page token.
                    endOfPage |= reader.BaseStream.Position == reader.BaseStream.Length;
                }

                // An atom count of zero on the page is not checked.
                if (page.AtomCount != 0 && atomCounter != page.AtomCount)
                {
                    Log.Warn("Number of atoms was {0} but should be {1}.", atomCounter, page.AtomCount);
                }
            }
        }
Exemplo n.º 2
0
 /// <summary>
 /// Initializes the converter with the markup writer it stores in its <c>writer</c> field.
 /// </summary>
 /// <param name="writer">Markup writer assigned to the <c>writer</c> field.</param>
 public Converter(IMarkupWriter writer)
 {
     this.writer = writer;
 }