/// <summary>
/// Writes the fixture's complex file table to an <c>HWPFFileSystem</c>, reads it
/// back from the written streams (at offset 0) and checks that the text of the
/// re-read text piece table equals the text of the original one.
/// </summary>
public void TestReadWrite()
{
    FileInformationBlock fileInfo = _hWPFDocFixture._fib;
    byte[] sourceMain = _hWPFDocFixture._mainStream;
    byte[] sourceTable = _hWPFDocFixture._tableStream;
    int textStart = fileInfo.GetFcMin();

    // Parse the table out of the fixture document.
    ComplexFileTable original = new ComplexFileTable(
        sourceMain, sourceTable, fileInfo.GetFcClx(), textStart);

    // Serialise it into a scratch file system.
    HWPFFileSystem scratch = new HWPFFileSystem();
    original.WriteTo(scratch);

    MemoryStream writtenTableStream = scratch.GetStream("1Table");
    MemoryStream writtenMainStream = scratch.GetStream("WordDocument");
    byte[] writtenTable = writtenTableStream.ToArray();
    byte[] writtenMain = writtenMainStream.ToArray();

    // Re-read what was just written; both offsets are 0 in the fresh streams.
    ComplexFileTable reloaded = new ComplexFileTable(writtenMain, writtenTable, 0, 0);

    string expectedText = original.GetTextPieceTable().Text.ToString();
    string actualText = reloaded.GetTextPieceTable().Text.ToString();

    Assert.AreEqual(expectedText, actualText);
}
/// <summary>
/// Round-trip check for <c>ComplexFileTable</c>: the table read from the fixture
/// is written out, parsed again from the written bytes, and the text of both
/// text piece tables is compared.
/// </summary>
public void TestReadWrite()
{
    FileInformationBlock header = _hWPFDocFixture._fib;
    byte[] fixtureMain = _hWPFDocFixture._mainStream;
    byte[] fixtureTable = _hWPFDocFixture._tableStream;
    int firstCharOffset = header.GetFcMin();

    ComplexFileTable before = new ComplexFileTable(
        fixtureMain, fixtureTable, header.GetFcClx(), firstCharOffset);

    // Write the table, then pull the two output streams back as byte arrays.
    HWPFFileSystem output = new HWPFFileSystem();
    before.WriteTo(output);

    MemoryStream tableSink = output.GetStream("1Table");
    MemoryStream mainSink = output.GetStream("WordDocument");
    byte[] roundTripTable = tableSink.ToArray();
    byte[] roundTripMain = mainSink.ToArray();

    // The written streams start at offset 0.
    ComplexFileTable after = new ComplexFileTable(roundTripMain, roundTripTable, 0, 0);

    TextPieceTable piecesBefore = before.GetTextPieceTable();
    TextPieceTable piecesAfter = after.GetTextPieceTable();

    Assert.AreEqual(piecesBefore.Text.ToString(), piecesAfter.Text.ToString());
}
/// <summary>
/// Round-trip check for <c>SectionTable</c>: the table read from the fixture is
/// written out and parsed again, then the section counts, the raw section
/// descriptor plex boundaries and the section nodes themselves are compared.
/// </summary>
public void TestReadWrite()
{
    FileInformationBlock fileInfo = _hWPFDocFixture._fib;
    byte[] sourceMain = _hWPFDocFixture._mainStream;
    byte[] sourceTable = _hWPFDocFixture._tableStream;
    int textStart = fileInfo.GetFcMin();

    CPSplitCalculator splitCalculator = new CPSplitCalculator(fileInfo);

    ComplexFileTable complexTable = new ComplexFileTable(
        sourceMain, sourceTable, fileInfo.GetFcClx(), textStart);
    TextPieceTable pieces = complexTable.GetTextPieceTable();

    SectionTable original = new SectionTable(
        sourceMain, sourceTable,
        fileInfo.GetFcPlcfsed(), fileInfo.GetLcbPlcfsed(),
        textStart, pieces, splitCalculator);

    // Serialise the section table into a scratch file system.
    HWPFFileSystem scratch = new HWPFFileSystem();
    original.WriteTo(scratch, 0);

    MemoryStream writtenTableStream = scratch.GetStream("1Table");
    MemoryStream writtenMainStream = scratch.GetStream("WordDocument");
    byte[] writtenTable = writtenTableStream.ToArray();
    byte[] writtenMain = writtenMainStream.ToArray();

    // Re-read it: the written table stream holds only the plex, starting at 0.
    SectionTable reloaded = new SectionTable(
        writtenMain, writtenTable, 0, writtenTable.Length, 0, pieces, splitCalculator);

    List<SEPX> expectedSections = original.GetSections();
    List<SEPX> actualSections = reloaded.GetSections();
    Assert.AreEqual(expectedSections.Count, actualSections.Count);

    // test for proper char offset conversions
    PlexOfCps expectedPlex = new PlexOfCps(
        sourceTable, fileInfo.GetFcPlcfsed(), fileInfo.GetLcbPlcfsed(), 12);
    PlexOfCps actualPlex = new PlexOfCps(writtenTable, 0, writtenTable.Length, 12);
    Assert.AreEqual(expectedPlex.Length, actualPlex.Length);

    int plexIndex = 0;
    while (plexIndex < expectedPlex.Length)
    {
        Assert.AreEqual(expectedPlex.GetProperty(plexIndex).Start,
                        actualPlex.GetProperty(plexIndex).Start);
        Assert.AreEqual(expectedPlex.GetProperty(plexIndex).End,
                        actualPlex.GetProperty(plexIndex).End);
        plexIndex++;
    }

    // Finally compare the section nodes pairwise.
    int sectionCount = expectedSections.Count;
    for (int i = 0; i < sectionCount; i++)
    {
        PropertyNode expectedNode = expectedSections[i];
        PropertyNode actualNode = actualSections[i];
        Assert.AreEqual(expectedNode, actualNode);
    }
}
/// <summary>
/// Loads an older-format Word document: reads the table locations from fixed
/// offsets in the main stream, builds the text piece table (from the complex
/// file table when the FIB reports a complex file, otherwise as a single
/// synthetic piece) and then loads the character, paragraph and section tables.
/// </summary>
/// <param name="directory">The directory node the document is read from (passed to the base constructor).</param>
public HWPFOldDocument(DirectoryNode directory)
    : base(directory)
{
    // Where are things?
    int sedTableOffset = LittleEndian.GetInt(_mainStream, 0x88);
    int sedTableSize = LittleEndian.GetInt(_mainStream, 0x8c);
    int chpTableOffset = LittleEndian.GetInt(_mainStream, 0xb8);
    int chpTableSize = LittleEndian.GetInt(_mainStream, 0xbc);
    int papTableOffset = LittleEndian.GetInt(_mainStream, 0xc0);
    int papTableSize = LittleEndian.GetInt(_mainStream, 0xc4);
    //int shfTableOffset = LittleEndian.GetInt(_mainStream, 0x60);
    //int shfTableSize = LittleEndian.GetInt(_mainStream, 0x64);
    int complexTableOffset = LittleEndian.GetInt(_mainStream, 0x160);

    // We need to get hold of the text that makes up the
    //  document, which might be regular or fast-saved
    if (_fib.IsFComplex())
    {
        // The main stream is passed as the table stream too: every offset
        // above is read from, and refers to, the main stream.
        ComplexFileTable cft = new ComplexFileTable(
            _mainStream, _mainStream,
            complexTableOffset, _fib.GetFcMin()
        );
        tpt = cft.GetTextPieceTable();
    }
    else
    {
        // TODO Discover if these older documents can ever hold Unicode Strings?
        //  (We think not, because they seem to lack a Piece table)
        // TODO Build the Piece Descriptor properly
        //  (We have to fake it, as they don't seem to have a proper Piece table)
        PieceDescriptor pd = new PieceDescriptor(new byte[] { 0, 0, 0, 0, 0, 127, 0, 0 }, 0);
        pd.FilePosition = _fib.GetFcMin();

        // Generate a single Text Piece Table, with a single Text Piece
        //  which covers all the (8 bit only) text in the file
        tpt = new TextPieceTable();
        byte[] textData = new byte[_fib.GetFcMac() - _fib.GetFcMin()];
        Array.Copy(_mainStream, _fib.GetFcMin(), textData, 0, textData.Length);
        TextPiece tp = new TextPiece(
            0, textData.Length, textData, pd
        );
        tpt.Add(tp);
    }

    _text = tpt.Text;

    // Now we can fetch the character, paragraph and section properties.
    // Each table must be read from its OWN offset; the size always belongs
    // to the same table as the offset it is paired with.
    _cbt = new OldCHPBinTable(
        _mainStream, chpTableOffset, chpTableSize,
        _fib.GetFcMin(), tpt
    );
    _pbt = new OldPAPBinTable(
        _mainStream, papTableOffset, papTableSize,
        _fib.GetFcMin(), tpt
    );
    _st = new OldSectionTable(
        _mainStream, sedTableOffset, sedTableSize,
        _fib.GetFcMin(), tpt
    );
}
/// <summary>
/// Rebuilds <c>_textRuns</c> so that the character runs are split at every run
/// boundary, each resulting range carries the merged SPRMs of all runs that
/// overlap it (in file order), and adjacent ranges with identical SPRMs are
/// joined again.
/// </summary>
/// <param name="complexFileTable">
/// Optional complex file table. When not null, a CHPX is added for every text
/// piece whose property modifier references a grpprl containing a CHP SPRM.
/// </param>
public void Rebuild(ComplexFileTable complexFileTable)
{
    // Elapsed times are measured in ticks and converted to milliseconds
    // before logging, since the messages are labelled "ms".
    long start = DateTime.Now.Ticks;

    if (complexFileTable != null)
    {
        SprmBuffer[] sprmBuffers = complexFileTable.GetGrpprls();

        // adding CHPX from fast-saved SPRMs
        foreach (TextPiece textPiece in complexFileTable.GetTextPieceTable().TextPieces)
        {
            PropertyModifier prm = textPiece.PieceDescriptor.Prm;
            if (!prm.IsComplex())
            {
                continue;
            }
            int igrpprl = prm.GetIgrpprl();

            if (igrpprl < 0 || igrpprl >= sprmBuffers.Length)
            {
                logger.Log(POILogger.WARN, textPiece
                        + "'s PRM references to unknown grpprl");
                continue;
            }

            bool hasChp = false;
            SprmBuffer sprmBuffer = sprmBuffers[igrpprl];
            // HasNext() has to be the loop CONDITION: without it Next() is
            // called past the end of a buffer that holds no CHP SPRM.
            for (SprmIterator iterator = sprmBuffer.Iterator(); iterator.HasNext();)
            {
                SprmOperation sprmOperation = iterator.Next();
                if (sprmOperation.Type == SprmOperation.TYPE_CHP)
                {
                    hasChp = true;
                    break;
                }
            }

            if (hasChp)
            {
                SprmBuffer newSprmBuffer = (SprmBuffer)sprmBuffer.Clone();

                CHPX chpx = new CHPX(textPiece.Start,
                        textPiece.End, newSprmBuffer);
                _textRuns.Add(chpx);
            }
        }
        logger.Log(POILogger.DEBUG,
                "Merged with CHPX from complex file table in ",
                (DateTime.Now.Ticks - start) / TimeSpan.TicksPerMillisecond,
                " ms (", _textRuns.Count,
                " elements in total)");
        start = DateTime.Now.Ticks;
    }

    List<CHPX> oldChpxSortedByStartPos = new List<CHPX>(_textRuns);
    oldChpxSortedByStartPos.Sort(
            (IComparer<CHPX>)PropertyNode.CHPXComparator.instance);

    logger.Log(POILogger.DEBUG, "CHPX sorted by start position in ",
            (DateTime.Now.Ticks - start) / TimeSpan.TicksPerMillisecond, " ms");
    start = DateTime.Now.Ticks;

    // Remember the position of every CHPX in the current list so overlapping
    // runs can later be merged in that order. The indexer is used instead of
    // Add() so that two runs the dictionary treats as the same key do not
    // throw (NOTE(review): relevant if CHPX defines value equality - confirm).
    Dictionary<CHPX, int> chpxToFileOrder = new Dictionary<CHPX, int>();
    int counter = 0;
    foreach (CHPX chpx in _textRuns)
    {
        chpxToFileOrder[chpx] = counter++;
    }
    CHPXToFileComparer chpxFileOrderComparator = new CHPXToFileComparer(chpxToFileOrder);

    logger.Log(POILogger.DEBUG, "CHPX's order map created in ",
            (DateTime.Now.Ticks - start) / TimeSpan.TicksPerMillisecond, " ms");
    start = DateTime.Now.Ticks;

    // Collect every run start/end, then keep each distinct non-zero value
    // once. Position 0 is the implicit start of the first range, and a
    // repeated boundary would otherwise yield an empty [b; b) range.
    List<int> sortedBoundaries = new List<int>(_textRuns.Count * 2);
    foreach (CHPX chpx in _textRuns)
    {
        sortedBoundaries.Add(chpx.Start);
        sortedBoundaries.Add(chpx.End);
    }
    sortedBoundaries.Sort();

    List<int> textRunsBoundariesList = new List<int>(sortedBoundaries.Count);
    foreach (int candidate in sortedBoundaries)
    {
        if (candidate == 0)
        {
            continue;
        }
        int kept = textRunsBoundariesList.Count;
        if (kept > 0 && textRunsBoundariesList[kept - 1] == candidate)
        {
            continue;
        }
        textRunsBoundariesList.Add(candidate);
    }

    logger.Log(POILogger.DEBUG, "Texts CHPX boundaries collected in ",
            (DateTime.Now.Ticks - start) / TimeSpan.TicksPerMillisecond, " ms");
    start = DateTime.Now.Ticks;

    List<CHPX> newChpxs = new List<CHPX>();
    int lastTextRunStart = 0;
    foreach (int boundary in textRunsBoundariesList)
    {
        int startInclusive = lastTextRunStart;
        int endExclusive = boundary;
        lastTextRunStart = endExclusive;

        // Find a starting index at or before the first run that can overlap
        // [startInclusive; endExclusive).
        int startPosition = BinarySearch(oldChpxSortedByStartPos, boundary);
        startPosition = Math.Abs(startPosition);
        while (startPosition >= oldChpxSortedByStartPos.Count)
        {
            startPosition--;
        }
        while (startPosition > 0
                && oldChpxSortedByStartPos[startPosition].Start >= boundary)
        {
            startPosition--;
        }

        List<CHPX> chpxs = new List<CHPX>();
        for (int c = startPosition; c < oldChpxSortedByStartPos.Count; c++)
        {
            CHPX chpx = oldChpxSortedByStartPos[c];

            // The list is sorted by start, so nothing further can overlap.
            if (boundary < chpx.Start)
            {
                break;
            }

            int left = Math.Max(startInclusive, chpx.Start);
            int right = Math.Min(endExclusive, chpx.End);

            if (left < right)
            {
                chpxs.Add(chpx);
            }
        }

        if (chpxs.Count == 0)
        {
            logger.Log(POILogger.WARN, "Text piece [",
                    startInclusive, "; ",
                    endExclusive,
                    ") has no CHPX. Creating new one.");
            // create it manually
            CHPX chpx = new CHPX(startInclusive, endExclusive,
                    new SprmBuffer(0));
            newChpxs.Add(chpx);
            continue;
        }

        if (chpxs.Count == 1)
        {
            // can we reuse existing?
            CHPX existing = chpxs[0];
            if (existing.Start == startInclusive
                    && existing.End == endExclusive)
            {
                newChpxs.Add(existing);
                continue;
            }
        }

        // Several runs overlap this range: concatenate their SPRMs in the
        // order recorded in chpxToFileOrder.
        chpxs.Sort(chpxFileOrderComparator);

        SprmBuffer sprmBuffer = new SprmBuffer(0);
        foreach (CHPX chpx in chpxs)
        {
            sprmBuffer.Append(chpx.GetGrpprl(), 0);
        }
        CHPX newChpx = new CHPX(startInclusive, endExclusive, sprmBuffer);
        newChpxs.Add(newChpx);
    }
    this._textRuns = new List<CHPX>(newChpxs);

    logger.Log(POILogger.DEBUG, "CHPX rebuilded in ",
            (DateTime.Now.Ticks - start) / TimeSpan.TicksPerMillisecond, " ms (",
            _textRuns.Count, " elements)");
    start = DateTime.Now.Ticks;

    // Compact: walk the runs front to back and fold each run into its
    // predecessor when they touch and carry byte-identical SPRMs.
    List<CHPX> compacted = new List<CHPX>(_textRuns.Count);
    CHPX previous = null;
    foreach (CHPX current in _textRuns)
    {
        if (previous != null
                && previous.End == current.Start
                && Arrays.Equals(previous.GetGrpprl(), current.GetGrpprl()))
        {
            previous.End = current.End;
            continue;
        }

        compacted.Add(current);
        previous = current;
    }
    this._textRuns = compacted;

    logger.Log(POILogger.DEBUG, "CHPX compacted in ",
            (DateTime.Now.Ticks - start) / TimeSpan.TicksPerMillisecond, " ms (",
            _textRuns.Count, " elements)");
}
/// <summary>
/// Initializes a new instance of the <see cref="HWPFDocument"/> class by
/// loading the table and data streams from the given directory and then
/// building the document's property, text, drawing, section, style, font,
/// list, bookmark, note and field structures from them.
/// </summary>
/// <param name="directory">The directory.</param>
/// <exception cref="OldWordFileFormatException">The FIB reports a version below 106 (Word 95 or older).</exception>
/// <exception cref="InvalidOperationException">The table stream named by the FIB is not present in <paramref name="directory"/>.</exception>
public HWPFDocument(DirectoryNode directory)
    : base(directory)
{
    // Provisional wrappers; both are re-created at the end of this
    // constructor once the note tables have been loaded from the file.
    _endnotes = new NotesImpl(_endnotesTables);
    _footnotes = new NotesImpl(_footnotesTables);

    // Load the main stream and FIB
    // Also handles HPSF bits

    // Do the CP Split
    _cpSplit = new CPSplitCalculator(_fib);

    // Is this document too old for us?
    if (_fib.GetNFib() < 106)
    {
        throw new OldWordFileFormatException("The document is too old - Word 95 or older. Try HWPFOldDocument instead?");
    }

    // use the fib to determine the name of the table stream.
    String name = "0Table";
    if (_fib.IsFWhichTblStm())
    {
        name = "1Table";
    }

    // Grab the table stream.
    DocumentEntry tableProps;
    try
    {
        tableProps =
            (DocumentEntry)directory.GetEntry(name);
    }
    catch (FileNotFoundException)
    {
        throw new InvalidOperationException("Table Stream '" + name + "' wasn't found - Either the document is corrupt, or is Word95 (or earlier)");
    }

    // read in the table stream.
    _tableStream = new byte[tableProps.Size];
    directory.CreatePOIFSDocumentReader(name).Read(_tableStream);

    _fib.FillVariableFields(_mainStream, _tableStream);

    // read in the data stream.
    try
    {
        DocumentEntry dataProps =
            (DocumentEntry)directory.GetEntry("Data");
        _dataStream = new byte[dataProps.Size];
        directory.CreatePOIFSDocumentReader("Data").Read(_dataStream);
    }
    catch (FileNotFoundException)
    {
        // The data stream is optional; fall back to an empty one.
        _dataStream = new byte[0];
    }

    // Get the cp of the start of text in the main stream
    // The latest spec doc says this is always zero!
    int fcMin = 0;
    //fcMin = _fib.GetFcMin()

    // Start to load up our standard structures.
    _dop = new DocumentProperties(_tableStream, _fib.GetFcDop());
    _cft = new ComplexFileTable(_mainStream, _tableStream, _fib.GetFcClx(), fcMin);
    TextPieceTable _tpt = _cft.GetTextPieceTable();

    // Now load the rest of the properties, which need to be adjusted
    //  for where text really begin
    _cbt = new CHPBinTable(_mainStream, _tableStream, _fib.GetFcPlcfbteChpx(), _fib.GetLcbPlcfbteChpx(), _tpt);
    _pbt = new PAPBinTable(_mainStream, _tableStream, _dataStream, _fib.GetFcPlcfbtePapx(), _fib.GetLcbPlcfbtePapx(), _tpt);

    _text = _tpt.Text;

    /*
     * in this mode we preserving PAPX/CHPX structure from file, so text may
     * miss from output, and text order may be corrupted
     */
    bool preserveBinTables = ReadBooleanAppSetting(PROPERTY_PRESERVE_BIN_TABLES);

    if (!preserveBinTables)
    {
        _cbt.Rebuild(_cft);
        _pbt.Rebuild(_text, _cft);
    }

    /*
     * Property to disable text rebuilding. In this mode changing the text
     * will lead to unpredictable behavior
     */
    bool preserveTextTable = ReadBooleanAppSetting(PROPERTY_PRESERVE_TEXT_TABLE);

    if (!preserveTextTable)
    {
        // Replace the piece table read from the file by a fresh table that
        // holds the whole text as one piece.
        _cft = new ComplexFileTable();
        _tpt = _cft.GetTextPieceTable();
        TextPiece textPiece = new SinglentonTextPiece(_text);
        _tpt.Add(textPiece);
        _text = textPiece.GetStringBuilder();
    }

    // Read FSPA and Escher information
    // _fspa = new FSPATable(_tableStream, _fib.getFcPlcspaMom(),
    // _fib.getLcbPlcspaMom(), getTextTable().getTextPieces());
    _fspaHeaders = new FSPATable(_tableStream, _fib,
            FSPADocumentPart.HEADER);
    _fspaMain = new FSPATable(_tableStream, _fib, FSPADocumentPart.MAIN);

    if (_fib.GetFcDggInfo() != 0)
    {
        _dgg = new EscherRecordHolder(_tableStream, _fib.GetFcDggInfo(), _fib.GetLcbDggInfo());
    }
    else
    {
        _dgg = new EscherRecordHolder();
    }

    // read in the pictures stream
    // NOTE(review): _fspa is not assigned anywhere in this constructor (its
    // only assignment is commented out above), so it is presumably null
    // here - confirm whether _fspaMain was intended.
    _pictures = new PicturesTable(this, _dataStream, _mainStream, _fspa, _dgg);

    // And the art shapes stream
    _officeArts = new ShapesTable(_tableStream, _fib);

    // And escher pictures
    _officeDrawingsHeaders = new OfficeDrawingsImpl(_fspaHeaders, _dgg, _mainStream);
    _officeDrawingsMain = new OfficeDrawingsImpl(_fspaMain, _dgg, _mainStream);

    _st = new SectionTable(_mainStream, _tableStream, _fib.GetFcPlcfsed(), _fib.GetLcbPlcfsed(), fcMin, _tpt, _cpSplit);
    _ss = new StyleSheet(_tableStream, _fib.GetFcStshf());
    _ft = new FontTable(_tableStream, _fib.GetFcSttbfffn(), _fib.GetLcbSttbfffn());

    // The optional tables below are only loaded when the FIB gives them a
    // non-zero location (and, where checked, a non-zero length).
    int listOffset = _fib.GetFcPlcfLst();
    int lfoOffset = _fib.GetFcPlfLfo();
    if (listOffset != 0 && _fib.GetLcbPlcfLst() != 0)
    {
        _lt = new ListTables(_tableStream, listOffset, lfoOffset);
    }

    int sbtOffset = _fib.GetFcSttbSavedBy();
    int sbtLength = _fib.GetLcbSttbSavedBy();
    if (sbtOffset != 0 && sbtLength != 0)
    {
        _sbt = new SavedByTable(_tableStream, sbtOffset, sbtLength);
    }

    int rmarkOffset = _fib.GetFcSttbfRMark();
    int rmarkLength = _fib.GetLcbSttbfRMark();
    if (rmarkOffset != 0 && rmarkLength != 0)
    {
        _rmat = new RevisionMarkAuthorTable(_tableStream, rmarkOffset, rmarkLength);
    }

    _bookmarksTables = new BookmarksTables(_tableStream, _fib);
    _bookmarks = new BookmarksImpl(_bookmarksTables);

    _endnotesTables = new NotesTables(NoteType.ENDNOTE, _tableStream, _fib);
    _endnotes = new NotesImpl(_endnotesTables);
    _footnotesTables = new NotesTables(NoteType.FOOTNOTE, _tableStream, _fib);
    _footnotes = new NotesImpl(_footnotesTables);

    _fieldsTables = new FieldsTables(_tableStream, _fib);
    _fields = new FieldsImpl(_fieldsTables);
}

/// <summary>
/// Reads a boolean application setting, treating a missing, malformed or
/// unreadable value as <c>false</c>.
/// </summary>
/// <param name="key">The application setting key.</param>
/// <returns>The parsed value, or <c>false</c> when no valid value is available.</returns>
private static bool ReadBooleanAppSetting(string key)
{
    try
    {
        // TryParse returns false for null or malformed input, so the normal
        // "setting not configured" case no longer raises an exception.
        bool parsed;
        return bool.TryParse(ConfigurationManager.AppSettings[key], out parsed) && parsed;
    }
    catch (Exception)
    {
        // Deliberately best effort: failing to read the configuration must
        // not prevent the document from loading.
        return false;
    }
}
/// <summary>
/// Rebuilds <c>_paragraphs</c> from the document text: one PAPX is produced
/// for every range that ends at a paragraph-terminating character (char codes
/// 13, 7 or 12), carrying the merged SPRMs of the existing PAPX that end
/// within that range.
/// </summary>
/// <param name="docText">The document text that is scanned for terminating characters.</param>
/// <param name="complexFileTable">
/// Optional complex file table. When not null, a PAPX is added for every text
/// piece whose property modifier references a grpprl containing a PAP SPRM.
/// </param>
public void Rebuild(StringBuilder docText, ComplexFileTable complexFileTable)
{
    // Elapsed times are measured in ticks and converted to milliseconds
    // before logging, since the messages are labelled "ms".
    long start = DateTime.Now.Ticks;

    if (complexFileTable != null)
    {
        SprmBuffer[] sprmBuffers = complexFileTable.GetGrpprls();

        // adding PAPX from fast-saved SPRMs
        foreach (TextPiece textPiece in complexFileTable.GetTextPieceTable().TextPieces)
        {
            PropertyModifier prm = textPiece.PieceDescriptor.Prm;
            if (!prm.IsComplex())
            {
                continue;
            }
            int igrpprl = prm.GetIgrpprl();

            if (igrpprl < 0 || igrpprl >= sprmBuffers.Length)
            {
                logger.Log(POILogger.WARN, textPiece
                        + "'s PRM references to unknown grpprl");
                continue;
            }

            bool hasPap = false;
            SprmBuffer sprmBuffer = sprmBuffers[igrpprl];
            for (SprmIterator iterator = sprmBuffer.Iterator(); iterator.HasNext();)
            {
                SprmOperation sprmOperation = iterator.Next();
                if (sprmOperation.Type == SprmOperation.TYPE_PAP)
                {
                    hasPap = true;
                    break;
                }
            }

            if (hasPap)
            {
                SprmBuffer newSprmBuffer = new SprmBuffer(2);
                newSprmBuffer.Append(sprmBuffer.ToByteArray());

                PAPX papx = new PAPX(textPiece.Start,
                        textPiece.End, newSprmBuffer);
                _paragraphs.Add(papx);
            }
        }

        logger.Log(POILogger.DEBUG,
                "Merged (?) with PAPX from complex file table in ",
                (DateTime.Now.Ticks - start) / TimeSpan.TicksPerMillisecond,
                " ms (", _paragraphs.Count,
                " elements in total)");
        start = DateTime.Now.Ticks;
    }

    List<PAPX> oldPapxSortedByEndPos = new List<PAPX>(_paragraphs);
    oldPapxSortedByEndPos.Sort(
            (IComparer<PAPX>)PropertyNode.PAPXComparator.instance);

    logger.Log(POILogger.DEBUG, "PAPX sorted by end position in ",
            (DateTime.Now.Ticks - start) / TimeSpan.TicksPerMillisecond, " ms");

    start = DateTime.Now.Ticks;

    // Remember the position of every PAPX in the current list so that the
    // PAPX of one paragraph can later be merged in that order.
    Dictionary<PAPX, int> papxToFileOrder = new Dictionary<PAPX, int>();
    int counter = 0;
    foreach (PAPX papx in _paragraphs)
    {
        papxToFileOrder[papx] = counter++;
    }
    PAPXToFileComparer papxFileOrderComparator = new PAPXToFileComparer(papxToFileOrder);

    logger.Log(POILogger.DEBUG, "PAPX's order map created in ",
            (DateTime.Now.Ticks - start) / TimeSpan.TicksPerMillisecond, " ms");

    start = DateTime.Now.Ticks;

    List<PAPX> newPapxs = new List<PAPX>();
    int lastParStart = 0;
    int lastPapxIndex = 0;

    for (int charIndex = 0; charIndex < docText.Length; charIndex++)
    {
        // Only characters 13, 7 and 12 close a paragraph range.
        char c = docText[charIndex];
        if (c != 13 && c != 7 && c != 12)
        {
            continue;
        }

        int startInclusive = lastParStart;
        int endExclusive = charIndex + 1;

        // Gather every not-yet-consumed PAPX that ends at or before this
        // character; scanning resumes from lastPapxIndex next time.
        bool broken = false;
        List<PAPX> papxs = new List<PAPX>();
        for (int papxIndex = lastPapxIndex; papxIndex < oldPapxSortedByEndPos.Count; papxIndex++)
        {
            broken = false;
            PAPX papx = oldPapxSortedByEndPos[papxIndex];

            if (papx.End - 1 > charIndex)
            {
                lastPapxIndex = papxIndex;
                broken = true;
                break;
            }

            papxs.Add(papx);
        }
        if (!broken)
        {
            lastPapxIndex = oldPapxSortedByEndPos.Count - 1;
        }

        if (papxs.Count == 0)
        {
            logger.Log(POILogger.WARN, "Paragraph [",
                    startInclusive, "; ",
                    endExclusive,
                    ") has no PAPX. Creating new one.");
            // create it manually
            PAPX papx = new PAPX(startInclusive, endExclusive,
                    new SprmBuffer(2));
            newPapxs.Add(papx);

            lastParStart = endExclusive;
            continue;
        }

        if (papxs.Count == 1)
        {
            // can we reuse existing?
            PAPX existing = papxs[0];
            if (existing.Start == startInclusive
                    && existing.End == endExclusive)
            {
                newPapxs.Add(existing);
                lastParStart = endExclusive;
                continue;
            }
        }

        // restore file order of PAPX
        papxs.Sort(papxFileOrderComparator);

        // The first PAPX supplies the base buffer; the grpprl of every
        // further PAPX is appended to it.
        SprmBuffer sprmBuffer = null;
        foreach (PAPX papx in papxs)
        {
            if (sprmBuffer == null)
            {
                sprmBuffer = (SprmBuffer)papx.GetSprmBuf().Clone();
            }
            else
            {
                sprmBuffer.Append(papx.GetGrpprl(), 2);
            }
        }
        PAPX newPapx = new PAPX(startInclusive, endExclusive, sprmBuffer);
        newPapxs.Add(newPapx);

        lastParStart = endExclusive;
    }
    this._paragraphs = new List<PAPX>(newPapxs);

    logger.Log(POILogger.DEBUG, "PAPX rebuilded from document text in ",
            (DateTime.Now.Ticks - start) / TimeSpan.TicksPerMillisecond, " ms (",
            _paragraphs.Count, " elements)");
}