/// <summary>
/// Reads an old-style PAP bin table from the document stream and collects
/// every PAPX found on the formatted disk pages that the table points at.
/// The collected paragraphs are sorted with the PAPX comparator afterwards.
/// </summary>
/// <param name="documentStream">Raw bytes of the document stream; also used as the data stream for each page.</param>
/// <param name="OffSet">Offset of the bin table plex inside <paramref name="documentStream"/>.</param>
/// <param name="size">Size in bytes of the bin table plex.</param>
/// <param name="fcMin">Not used by this constructor.</param>
/// <param name="tpt">Text piece table handed to every formatted disk page.</param>
public OldPAPBinTable(byte[] documentStream, int OffSet, int size, int fcMin, TextPieceTable tpt)
{
    PlexOfCps plex = new PlexOfCps(documentStream, OffSet, size, 2);
    int entryCount = plex.Length;

    for (int entry = 0; entry < entryCount; entry++)
    {
        // The entry's bytes are read as a short page number, which is then
        // scaled by the block size to get the page's byte offset.
        int pageNum = LittleEndian.GetShort(plex.GetProperty(entry).Bytes);
        int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE * pageNum;

        PAPFormattedDiskPage page = new PAPFormattedDiskPage(documentStream, documentStream, pageOffset, tpt);

        int papxCount = page.Size();
        for (int i = 0; i < papxCount; i++)
        {
            _paragraphs.Add(page.GetPAPX(i));
        }
    }

    _paragraphs.Sort((IComparer<PAPX>)PropertyNode.PAPXComparator.instance);
}
/// <summary>
/// Parses the complex file table found at <paramref name="offset"/> in the
/// table stream: first any leading grpprl entries, then the text piece table.
/// </summary>
/// <param name="documentStream">Raw bytes of the document stream, passed on to the text piece table.</param>
/// <param name="tableStream">Raw bytes of the table stream that is being parsed.</param>
/// <param name="offset">Position in <paramref name="tableStream"/> where parsing starts.</param>
/// <param name="fcMin">Passed through to the text piece table.</param>
/// <exception cref="IOException">
/// Thrown when the byte following the grpprl entries is not the text piece table marker.
/// </exception>
public ComplexFileTable(byte[] documentStream, byte[] tableStream, int offset, int fcMin)
{
    // Walk over the prms that precede the piece table. These contain data
    // for actual fast saved files.
    List<SprmBuffer> grpprls = new List<SprmBuffer>();
    int cursor = offset;
    while (tableStream[cursor] == GRPPRL_TYPE)
    {
        // Step over the type byte, then read the short length prefix.
        cursor++;
        int grpprlSize = LittleEndian.GetShort(tableStream, cursor);
        cursor += LittleEndianConsts.SHORT_SIZE;

        byte[] grpprl = LittleEndian.GetByteArray(tableStream, cursor, grpprlSize);
        cursor += grpprlSize;

        grpprls.Add(new SprmBuffer(grpprl, false, 0));
    }
    this._grpprls = grpprls.ToArray();

    if (tableStream[cursor] != TEXT_PIECE_TABLE_TYPE)
    {
        throw new IOException("The text piece table is corrupted");
    }

    // Step over the type byte, then read the int size of the piece table.
    cursor++;
    int pieceTableSize = LittleEndian.GetInt(tableStream, cursor);
    cursor += LittleEndianConsts.INT_SIZE;

    _tpt = new TextPieceTable(documentStream, tableStream, cursor, pieceTableSize, fcMin);
}
/// <summary>
/// Round-trips the fixture's PAP bin table: reads it, writes it to a fresh
/// file system, reads the written streams back and checks that both tables
/// hold the same number of paragraphs and that each pair is equal.
/// </summary>
public void TestReadWrite()
{
    TextPieceTable fakeTPT = new TextPieceTable();

    FileInformationBlock fib = _hWPFDocFixture._fib;
    byte[] mainStream = _hWPFDocFixture._mainStream;
    byte[] tableStream = _hWPFDocFixture._tableStream;

    // Read the original table from the fixture streams.
    _pAPBinTable = new PAPBinTable(
        mainStream,
        tableStream,
        null,
        fib.GetFcPlcfbtePapx(),
        fib.GetLcbPlcfbtePapx(),
        fakeTPT);

    // Write it out again.
    HWPFFileSystem fileSys = new HWPFFileSystem();
    _pAPBinTable.WriteTo(fileSys, fakeTPT);

    MemoryStream tableOut = fileSys.GetStream("1Table");
    MemoryStream mainOut = fileSys.GetStream("WordDocument");
    byte[] newTableStream = tableOut.ToArray();
    byte[] newMainStream = mainOut.ToArray();

    // Re-read from the freshly written streams; the table now starts at offset 0.
    PAPBinTable newBinTable = new PAPBinTable(
        newMainStream,
        newTableStream,
        null,
        0,
        newTableStream.Length,
        fakeTPT);

    List<PAPX> oldTextRuns = _pAPBinTable.GetParagraphs();
    List<PAPX> newTextRuns = newBinTable.GetParagraphs();

    Assert.AreEqual(oldTextRuns.Count, newTextRuns.Count);

    int runCount = oldTextRuns.Count;
    for (int i = 0; i < runCount; i++)
    {
        PropertyNode expected = (PropertyNode)oldTextRuns[i];
        PropertyNode actual = (PropertyNode)newTextRuns[i];
        Assert.IsTrue(expected.Equals(actual));
    }
}
/// <summary>
/// Round-trips the fixture's section table: reads it, writes it to a fresh
/// file system, reads the written streams back and checks that the section
/// counts, the plex start/end offsets and the individual sections all match.
/// </summary>
public void TestReadWrite()
{
    FileInformationBlock fib = _hWPFDocFixture._fib;
    byte[] mainStream = _hWPFDocFixture._mainStream;
    byte[] tableStream = _hWPFDocFixture._tableStream;
    int fcMin = fib.GetFcMin();

    CPSplitCalculator cps = new CPSplitCalculator(fib);

    ComplexFileTable cft = new ComplexFileTable(mainStream, tableStream, fib.GetFcClx(), fcMin);
    TextPieceTable tpt = cft.GetTextPieceTable();

    // Read the original table from the fixture streams.
    SectionTable sectionTable = new SectionTable(
        mainStream,
        tableStream,
        fib.GetFcPlcfsed(),
        fib.GetLcbPlcfsed(),
        fcMin,
        tpt,
        cps);

    // Write it out again.
    HWPFFileSystem fileSys = new HWPFFileSystem();
    sectionTable.WriteTo(fileSys, 0);

    MemoryStream tableOut = fileSys.GetStream("1Table");
    MemoryStream mainOut = fileSys.GetStream("WordDocument");
    byte[] newTableStream = tableOut.ToArray();
    byte[] newMainStream = mainOut.ToArray();

    // Re-read from the freshly written streams; the table now starts at offset 0.
    SectionTable newSectionTable = new SectionTable(
        newMainStream,
        newTableStream,
        0,
        newTableStream.Length,
        0,
        tpt,
        cps);

    List<SEPX> oldSections = sectionTable.GetSections();
    List<SEPX> newSections = newSectionTable.GetSections();

    Assert.AreEqual(oldSections.Count, newSections.Count);

    // Test for proper char offset conversions.
    PlexOfCps oldSedPlex = new PlexOfCps(tableStream, fib.GetFcPlcfsed(), fib.GetLcbPlcfsed(), 12);
    PlexOfCps newSedPlex = new PlexOfCps(newTableStream, 0, newTableStream.Length, 12);
    Assert.AreEqual(oldSedPlex.Length, newSedPlex.Length);

    for (int i = 0; i < oldSedPlex.Length; i++)
    {
        Assert.AreEqual(oldSedPlex.GetProperty(i).Start, newSedPlex.GetProperty(i).Start);
        Assert.AreEqual(oldSedPlex.GetProperty(i).End, newSedPlex.GetProperty(i).End);
    }

    int sectionCount = oldSections.Count;
    for (int i = 0; i < sectionCount; i++)
    {
        PropertyNode expected = (PropertyNode)oldSections[i];
        PropertyNode actual = (PropertyNode)newSections[i];
        Assert.AreEqual(expected, actual);
    }
}
/// <summary>
/// Constructor used to read an old-style binTable in from a Word document.
/// Every CHPX found on the formatted disk pages referenced by the bin table
/// is appended to the list of text runs.
/// </summary>
/// <param name="documentStream">Raw bytes of the document stream.</param>
/// <param name="OffSet">Offset of the bin table plex inside <paramref name="documentStream"/>.</param>
/// <param name="size">Size in bytes of the bin table plex.</param>
/// <param name="fcMin">Passed through to each formatted disk page.</param>
/// <param name="tpt">Text piece table handed to each formatted disk page.</param>
public OldCHPBinTable(byte[] documentStream, int OffSet, int size, int fcMin, TextPieceTable tpt)
{
    PlexOfCps plex = new PlexOfCps(documentStream, OffSet, size, 2);
    int entryCount = plex.Length;

    for (int entry = 0; entry < entryCount; entry++)
    {
        // The entry's bytes are read as a short page number, which is then
        // scaled by the block size to get the page's byte offset.
        int pageNum = LittleEndian.GetShort(plex.GetProperty(entry).Bytes);
        int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE * pageNum;

        CHPFormattedDiskPage page = new CHPFormattedDiskPage(documentStream, pageOffset, fcMin, tpt);

        int chpxCount = page.Size();
        for (int i = 0; i < chpxCount; i++)
        {
            _textRuns.Add(page.GetCHPX(i));
        }
    }
}
/// <summary>
/// Loads an old-format Word document from the given directory: locates the
/// section, character and paragraph tables via fixed offsets in the main
/// stream, builds the text piece table (real or faked), and then reads the
/// character, paragraph and section property tables.
/// </summary>
/// <param name="directory">The directory node holding the document's streams.</param>
public HWPFOldDocument(DirectoryNode directory)
    : base(directory)
{
    // Where are things?
    int sedTableOffset = LittleEndian.GetInt(_mainStream, 0x88);
    int sedTableSize = LittleEndian.GetInt(_mainStream, 0x8c);
    int chpTableOffset = LittleEndian.GetInt(_mainStream, 0xb8);
    int chpTableSize = LittleEndian.GetInt(_mainStream, 0xbc);
    int papTableOffset = LittleEndian.GetInt(_mainStream, 0xc0);
    int papTableSize = LittleEndian.GetInt(_mainStream, 0xc4);
    //int shfTableOffset = LittleEndian.GetInt(_mainStream, 0x60);
    //int shfTableSize = LittleEndian.GetInt(_mainStream, 0x64);
    int complexTableOffset = LittleEndian.GetInt(_mainStream, 0x160);

    // We need to get hold of the text that makes up the
    // document, which might be regular or fast-saved.
    if (_fib.IsFComplex())
    {
        // Old documents keep the complex table in the main stream, so the
        // main stream is passed as both document and table stream.
        ComplexFileTable cft = new ComplexFileTable(
            _mainStream, _mainStream,
            complexTableOffset, _fib.GetFcMin()
        );
        tpt = cft.GetTextPieceTable();
    }
    else
    {
        // TODO Discover if these older documents can ever hold Unicode Strings?
        //  (We think not, because they seem to lack a Piece table)
        // TODO Build the Piece Descriptor properly
        //  (We have to fake it, as they don't seem to have a proper Piece table)
        PieceDescriptor pd = new PieceDescriptor(new byte[] { 0, 0, 0, 0, 0, 127, 0, 0 }, 0);
        pd.FilePosition = _fib.GetFcMin();

        // Generate a single Text Piece Table, with a single Text Piece
        // which covers all the (8 bit only) text in the file.
        tpt = new TextPieceTable();
        byte[] textData = new byte[_fib.GetFcMac() - _fib.GetFcMin()];
        Array.Copy(_mainStream, _fib.GetFcMin(), textData, 0, textData.Length);
        TextPiece tp = new TextPiece(
            0, textData.Length, textData, pd
        );
        tpt.Add(tp);
    }

    _text = tpt.Text;

    // Now we can fetch the character, paragraph and section properties.
    // Each table must be read from its OWN offset: passing chpTableOffset
    // for all three made the paragraph and section tables parse the
    // character table's bytes.
    _cbt = new OldCHPBinTable(
        _mainStream, chpTableOffset, chpTableSize,
        _fib.GetFcMin(), tpt
    );
    _pbt = new OldPAPBinTable(
        _mainStream, papTableOffset, papTableSize,
        _fib.GetFcMin(), tpt
    );
    _st = new OldSectionTable(
        _mainStream, sedTableOffset, sedTableSize,
        _fib.GetFcMin(), tpt
    );
}
/// <summary>
/// Loads an old-format Word document from the given directory: locates the
/// section, character and paragraph tables via fixed offsets in the main
/// stream, builds the text piece table (real or faked), and then reads the
/// character, paragraph and section property tables.
/// </summary>
/// <param name="directory">The directory node holding the document's streams.</param>
public HWPFOldDocument(DirectoryNode directory)
    : base(directory)
{
    // Where are things?
    int sedTableOffset = LittleEndian.GetInt(_mainStream, 0x88);
    int sedTableSize = LittleEndian.GetInt(_mainStream, 0x8c);
    int chpTableOffset = LittleEndian.GetInt(_mainStream, 0xb8);
    int chpTableSize = LittleEndian.GetInt(_mainStream, 0xbc);
    int papTableOffset = LittleEndian.GetInt(_mainStream, 0xc0);
    int papTableSize = LittleEndian.GetInt(_mainStream, 0xc4);
    //int shfTableOffset = LittleEndian.GetInt(_mainStream, 0x60);
    //int shfTableSize = LittleEndian.GetInt(_mainStream, 0x64);
    int complexTableOffset = LittleEndian.GetInt(_mainStream, 0x160);

    // We need to get hold of the text that makes up the
    // document, which might be regular or fast-saved.
    if (_fib.IsFComplex())
    {
        // Old documents keep the complex table in the main stream, so the
        // main stream is passed as both document and table stream.
        ComplexFileTable cft = new ComplexFileTable(
            _mainStream, _mainStream,
            complexTableOffset, _fib.GetFcMin()
        );
        tpt = cft.GetTextPieceTable();
    }
    else
    {
        // TODO Discover if these older documents can ever hold Unicode Strings?
        //  (We think not, because they seem to lack a Piece table)
        // TODO Build the Piece Descriptor properly
        //  (We have to fake it, as they don't seem to have a proper Piece table)
        PieceDescriptor pd = new PieceDescriptor(new byte[] { 0, 0, 0, 0, 0, 127, 0, 0 }, 0);
        pd.FilePosition = _fib.GetFcMin();

        // Generate a single Text Piece Table, with a single Text Piece
        // which covers all the (8 bit only) text in the file.
        tpt = new TextPieceTable();
        byte[] textData = new byte[_fib.GetFcMac() - _fib.GetFcMin()];
        Array.Copy(_mainStream, _fib.GetFcMin(), textData, 0, textData.Length);
        TextPiece tp = new TextPiece(
            0, textData.Length, textData, pd
        );
        tpt.Add(tp);
    }

    _text = tpt.Text;

    // Now we can fetch the character, paragraph and section properties.
    // Each table must be read from its OWN offset: passing chpTableOffset
    // for all three made the paragraph and section tables parse the
    // character table's bytes.
    _cbt = new OldCHPBinTable(
        _mainStream, chpTableOffset, chpTableSize,
        _fib.GetFcMin(), tpt
    );
    _pbt = new OldPAPBinTable(
        _mainStream, papTableOffset, papTableSize,
        _fib.GetFcMin(), tpt
    );
    _st = new OldSectionTable(
        _mainStream, sedTableOffset, sedTableSize,
        _fib.GetFcMin(), tpt
    );
}
/// <summary>
/// Reads the section descriptor plex from the table stream and builds one
/// SEPX per entry, pulling each section's grpprl bytes from the document
/// stream. Afterwards, if no section ends at the main document end but one
/// does when measured in bytes, the section boundaries are reset to the raw
/// plex start/end values.
/// </summary>
/// <param name="documentStream">Raw bytes of the document stream holding the SEPX data.</param>
/// <param name="tableStream">Raw bytes of the table stream holding the plex.</param>
/// <param name="OffSet">Offset of the plex inside <paramref name="tableStream"/>.</param>
/// <param name="size">Size in bytes of the plex.</param>
/// <param name="fcMin">Not used by this constructor.</param>
/// <param name="tpt">Text piece table; stored together with its text pieces.</param>
/// <param name="cps">Supplies the end position of the main document.</param>
public SectionTable(byte[] documentStream, byte[] tableStream, int OffSet, int size, int fcMin, TextPieceTable tpt, CPSplitCalculator cps)
{
    PlexOfCps sedPlex = new PlexOfCps(tableStream, OffSet, size, SED_SIZE);
    this.tpt = tpt;
    this._text = tpt.TextPieces;

    int sedCount = sedPlex.Length;
    for (int index = 0; index < sedCount; index++)
    {
        GenericPropertyNode entry = sedPlex.GetProperty(index);
        SectionDescriptor sed = new SectionDescriptor(entry.Bytes, 0);

        int fc = sed.GetFc();
        int sectionStart = CPtoFC(entry.Start);
        int sectionEnd = CPtoFC(entry.End);

        byte[] grpprl;
        if (fc == -1)
        {
            // Check for the optimization: 0xffffffff marks a section
            // that carries no grpprl of its own.
            grpprl = new byte[0];
        }
        else
        {
            // The first short at the offset is the size of the grpprl.
            int sepxSize = LittleEndian.GetShort(documentStream, fc);
            grpprl = new byte[sepxSize];
            Array.Copy(documentStream, fc + LittleEndianConsts.SHORT_SIZE, grpprl, 0, grpprl.Length);
        }

        _sections.Add(new SEPX(sed, sectionStart, sectionEnd, grpprl));
    }

    // Some files seem to lie about their unicode status, which
    //  is very very pesky. Try to work around these, but this
    //  is getting on for black magic...
    int mainEndsAt = cps.GetMainDocumentEnd();
    bool matchAt = false;
    bool matchHalf = false;
    foreach (SEPX section in _sections)
    {
        if (section.End == mainEndsAt)
        {
            matchAt = true;
        }
        else if (section.EndBytes == mainEndsAt || section.EndBytes == mainEndsAt - 1)
        {
            matchHalf = true;
        }
    }

    if (matchHalf && !matchAt)
    {
        // The document seemed to be mostly unicode, but the section
        // definition was in bytes. Try anyway, using the raw plex
        // positions; things may well go wrong.
        for (int index = 0; index < _sections.Count; index++)
        {
            SEPX section = _sections[index];
            GenericPropertyNode entry = sedPlex.GetProperty(index);
            int rawStart = entry.Start;
            int rawEnd = entry.End;
            section.Start = rawStart;
            section.End = rawEnd;
        }
    }
}
/// <summary>
/// Creates a complex file table that holds a new, default-constructed text piece table.
/// </summary>
public ComplexFileTable()
{
    this._tpt = new TextPieceTable();
}
/// <summary>
/// Initializes a new instance of the <see cref="HWPFDocument"/> class by
/// reading the table and data streams from the directory and loading the
/// document's property tables from them.
/// </summary>
/// <param name="directory">The directory.</param>
/// <exception cref="OldWordFileFormatException">
/// Thrown when the FIB reports an nFib below 106.
/// </exception>
/// <exception cref="InvalidOperationException">
/// Thrown when the table stream named by the FIB is not present in the directory.
/// </exception>
public HWPFDocument(DirectoryNode directory)
    : base(directory)
{
    _endnotes = new NotesImpl(_endnotesTables);
    _footnotes = new NotesImpl(_footnotesTables);

    // The base constructor has loaded the main stream and FIB
    // (and handles the HPSF bits).

    // Do the CP Split
    _cpSplit = new CPSplitCalculator(_fib);

    // Is this document too old for us?
    if (_fib.GetNFib() < 106)
    {
        throw new OldWordFileFormatException("The document is too old - Word 95 or older. Try HWPFOldDocument instead?");
    }

    // Use the fib to determine the name of the table stream.
    String tableStreamName = _fib.IsFWhichTblStm() ? "1Table" : "0Table";

    // Grab the table stream.
    DocumentEntry tableEntry;
    try
    {
        tableEntry = (DocumentEntry)directory.GetEntry(tableStreamName);
    }
    catch (FileNotFoundException)
    {
        throw new InvalidOperationException("Table Stream '" + tableStreamName + "' wasn't found - Either the document is corrupt, or is Word95 (or earlier)");
    }

    // Read in the table stream.
    _tableStream = new byte[tableEntry.Size];
    directory.CreatePOIFSDocumentReader(tableStreamName).Read(_tableStream);

    _fib.FillVariableFields(_mainStream, _tableStream);

    // Read in the data stream; a missing "Data" entry yields an empty stream.
    try
    {
        DocumentEntry dataEntry = (DocumentEntry)directory.GetEntry("Data");
        _dataStream = new byte[dataEntry.Size];
        directory.CreatePOIFSDocumentReader("Data").Read(_dataStream);
    }
    catch (FileNotFoundException)
    {
        _dataStream = new byte[0];
    }

    // Get the cp of the start of text in the main stream.
    // The latest spec doc says this is always zero!
    int fcMin = 0;
    //fcMin = _fib.GetFcMin()

    // Start to load up our standard structures.
    _dop = new DocumentProperties(_tableStream, _fib.GetFcDop());
    _cft = new ComplexFileTable(_mainStream, _tableStream, _fib.GetFcClx(), fcMin);
    TextPieceTable textPieceTable = _cft.GetTextPieceTable();

    // Now load the rest of the properties, which need to be adjusted
    // for where text really begins.
    _cbt = new CHPBinTable(
        _mainStream, _tableStream,
        _fib.GetFcPlcfbteChpx(), _fib.GetLcbPlcfbteChpx(),
        textPieceTable);
    _pbt = new PAPBinTable(
        _mainStream, _tableStream, _dataStream,
        _fib.GetFcPlcfbtePapx(), _fib.GetLcbPlcfbtePapx(),
        textPieceTable);

    _text = textPieceTable.Text;

    /*
     * In this mode we are preserving the PAPX/CHPX structure from the file,
     * so text may be missing from the output, and text order may be corrupted.
     */
    bool preserveBinTables;
    try
    {
        preserveBinTables = Boolean.Parse(
            ConfigurationManager.AppSettings[PROPERTY_PRESERVE_BIN_TABLES]);
    }
    catch (Exception)
    {
        // Missing or unparsable setting: fall back to rebuilding.
        preserveBinTables = false;
    }

    if (!preserveBinTables)
    {
        _cbt.Rebuild(_cft);
        _pbt.Rebuild(_text, _cft);
    }

    /*
     * Property to disable text rebuilding. In this mode changing the text
     * will lead to unpredictable behavior.
     */
    bool preserveTextTable;
    try
    {
        preserveTextTable = Boolean.Parse(
            ConfigurationManager.AppSettings[PROPERTY_PRESERVE_TEXT_TABLE]);
    }
    catch (Exception)
    {
        // Missing or unparsable setting: fall back to rebuilding.
        preserveTextTable = false;
    }

    if (!preserveTextTable)
    {
        // Replace the parsed piece table with one holding a single piece
        // built from the text read so far.
        _cft = new ComplexFileTable();
        textPieceTable = _cft.GetTextPieceTable();
        TextPiece textPiece = new SinglentonTextPiece(_text);
        textPieceTable.Add(textPiece);
        _text = textPiece.GetStringBuilder();
    }

    // Read FSPA and Escher information
    // _fspa = new FSPATable(_tableStream, _fib.getFcPlcspaMom(),
    // _fib.getLcbPlcspaMom(), getTextTable().getTextPieces());
    _fspaHeaders = new FSPATable(_tableStream, _fib, FSPADocumentPart.HEADER);
    _fspaMain = new FSPATable(_tableStream, _fib, FSPADocumentPart.MAIN);

    _dgg = _fib.GetFcDggInfo() != 0
        ? new EscherRecordHolder(_tableStream, _fib.GetFcDggInfo(), _fib.GetLcbDggInfo())
        : new EscherRecordHolder();

    // Read in the pictures stream.
    // NOTE(review): _fspa is not assigned anywhere in this constructor - confirm it is set elsewhere.
    _pictures = new PicturesTable(this, _dataStream, _mainStream, _fspa, _dgg);

    // And the art shapes stream
    _officeArts = new ShapesTable(_tableStream, _fib);

    // And escher pictures
    _officeDrawingsHeaders = new OfficeDrawingsImpl(_fspaHeaders, _dgg, _mainStream);
    _officeDrawingsMain = new OfficeDrawingsImpl(_fspaMain, _dgg, _mainStream);

    _st = new SectionTable(
        _mainStream, _tableStream,
        _fib.GetFcPlcfsed(), _fib.GetLcbPlcfsed(),
        fcMin, textPieceTable, _cpSplit);
    _ss = new StyleSheet(_tableStream, _fib.GetFcStshf());
    _ft = new FontTable(_tableStream, _fib.GetFcSttbfffn(), _fib.GetLcbSttbfffn());

    // List tables are only loaded when both offset and length are non-zero.
    int listOffset = _fib.GetFcPlcfLst();
    int lfoOffset = _fib.GetFcPlfLfo();
    if (listOffset != 0 && _fib.GetLcbPlcfLst() != 0)
    {
        _lt = new ListTables(_tableStream, _fib.GetFcPlcfLst(), _fib.GetFcPlfLfo());
    }

    int savedByOffset = _fib.GetFcSttbSavedBy();
    int savedByLength = _fib.GetLcbSttbSavedBy();
    if (savedByOffset != 0 && savedByLength != 0)
    {
        _sbt = new SavedByTable(_tableStream, savedByOffset, savedByLength);
    }

    int revisionMarkOffset = _fib.GetFcSttbfRMark();
    int revisionMarkLength = _fib.GetLcbSttbfRMark();
    if (revisionMarkOffset != 0 && revisionMarkLength != 0)
    {
        _rmat = new RevisionMarkAuthorTable(_tableStream, revisionMarkOffset, revisionMarkLength);
    }

    _bookmarksTables = new BookmarksTables(_tableStream, _fib);
    _bookmarks = new BookmarksImpl(_bookmarksTables);

    // The notes wrappers created at the top are replaced now that the
    // real tables have been read from the table stream.
    _endnotesTables = new NotesTables(NoteType.ENDNOTE, _tableStream, _fib);
    _endnotes = new NotesImpl(_endnotesTables);
    _footnotesTables = new NotesTables(NoteType.FOOTNOTE, _tableStream, _fib);
    _footnotes = new NotesImpl(_footnotesTables);

    _fieldsTables = new FieldsTables(_tableStream, _fib);
    _fields = new FieldsImpl(_fieldsTables);
}
/// <summary>
/// Convenience overload that forwards to the three-argument constructor.
/// </summary>
/// <param name="documentStream">Raw bytes of the document stream.</param>
/// <param name="offset">Offset of the section table inside <paramref name="documentStream"/>.</param>
/// <param name="size">Size in bytes of the section table.</param>
/// <param name="fcMin">Accepted but not used.</param>
/// <param name="tpt">Accepted but not used.</param>
public OldSectionTable(byte[] documentStream, int offset, int size, int fcMin, TextPieceTable tpt)
    : this(documentStream, offset, size)
{
}
/// <summary>
/// This constructs a CHPFormattedDiskPage from a raw fkp (512 byte array
/// read from a Word file). Forwards to the overload that takes no fcMin.
/// </summary>
/// <param name="documentStream">Raw bytes of the document stream.</param>
/// <param name="offset">Offset of the fkp inside <paramref name="documentStream"/>.</param>
/// <param name="fcMin">Accepted but not used.</param>
/// <param name="tpt">Text piece table passed on to the forwarded constructor.</param>
public CHPFormattedDiskPage(byte[] documentStream, int offset, int fcMin, TextPieceTable tpt)
    : this(documentStream, offset, tpt)
{
}