/// <summary>
/// Cleans the enabled metadata elements out of the PowerPoint (pptx) content held by
/// <paramref name="tempFile"/> and copies the interim result over the temp file on disk.
/// </summary>
/// <param name="tempFile">Supplies the input bytes (via its memory stream) and the path that receives the cleaned copy.</param>
/// <param name="cleanProperties">Property set from which the enabled elements and the exclusion list are derived.</param>
/// <remarks>
/// Any failure is logged and rethrown unchanged; <c>CleanUp()</c> always runs afterwards.
/// </remarks>
public override void RemoveMetadata(TempFileForActions tempFile, CleanActionPropertySet cleanProperties)
{
    try
    {
        m_elementsToClean = GetListOfEnabledElementsToClean(cleanProperties);
        List<Exclusion> exclusions = GetListOfExcludedElements(cleanProperties);

        using (BinaryData sourceData = new BinaryData(tempFile.GetMemoryStream()))
        using (PptxDocumentReader pptxReader = new PptxDocumentReader(sourceData))
        using (Stream cleanedOutput = GetOutputStream())
        {
            pptxReader.CleanTo(cleanedOutput, m_elementsToClean, exclusions);

            // NOTE(review): the copy runs while cleanedOutput is still open. Presumably
            // GetOutputStream() writes to InterimTempFileName - confirm the data is flushed
            // (and the file shareable) at this point.
            File.Copy(InterimTempFileName, tempFile.TempFile, true);
        }
    }
    catch (System.Exception failure)
    {
        Logger.LogError("PowerpointX cleaning failed");
        Logger.LogError(failure);
        throw;
    }
    finally
    {
        CleanUp();
    }
}
/// <summary>
/// Cleans the pptx content held by <paramref name="tempFile"/> of the elements returned by
/// <c>GetElementsNotRemovedByDomClean</c>, then copies the interim result over the temp file on disk.
/// </summary>
/// <param name="tempFile">Supplies the input bytes (via its memory stream) and the path that receives the cleaned copy.</param>
/// <param name="elementsToClean">Property set from which the enabled elements are derived.</param>
/// <remarks>
/// Any failure is logged and rethrown unchanged; <c>CleanUp()</c> always runs afterwards.
/// </remarks>
private void ExecuteUsingOfficeOpenXml(TempFileForActions tempFile, CleanActionPropertySet elementsToClean)
{
    try
    {
        m_elementsToClean = GetElementsNotRemovedByDomClean(GetListOfEnabledElementsToClean(elementsToClean));

        using (BinaryData sourceData = new BinaryData(tempFile.GetMemoryStream()))
        using (PptxDocumentReader pptxReader = new PptxDocumentReader(sourceData))
        using (Stream cleanedOutput = GetOutputStream())
        {
            pptxReader.CleanTo(cleanedOutput, m_elementsToClean);

            // NOTE(review): the copy runs while cleanedOutput is still open. Presumably
            // GetOutputStream() writes to InterimTempFileName - confirm the data is flushed
            // (and the file shareable) at this point.
            File.Copy(InterimTempFileName, tempFile.TempFile, true);
        }
    }
    catch (System.Exception failure)
    {
        Logger.LogError(failure);
        throw;
    }
    finally
    {
        CleanUp();
    }
}
/// <summary>
/// Reads the converted file of <paramref name="workItem"/> with the reader matching its extension and
/// stores the discovered text on the work item, adding the converted file's length to its file size.
/// </summary>
/// <param name="workItem">
/// Work item to process. Its <c>Extension</c> must be "doc", "ppt" or "xls"; the file read is
/// <c>ConvertedFileName</c>.
/// </param>
/// <exception cref="Exception">
/// Thrown for an unexpected extension, or rethrown from the reader. In either case the work item is
/// marked <c>FailureMode.DiscoveryFailed</c> with <c>ShouldAbort</c> set before the exception propagates.
/// </exception>
public override void Execute(WorkItem workItem)
{
    try
    {
        // The discovery timer is started only after the reader has been constructed, so reader
        // construction is not included in the timed span.
        switch (workItem.Extension)
        {
            case "doc":
                using (DocxDocumentReader reader = new DocxDocumentReader(workItem.ConvertedFileName))
                {
                    workItem.discoveryTimer.Start();
                    DocumentText results = reader.Read();
                    workItem.DiscoveredText = results;
                    workItem.FileSize += new FileInfo(workItem.ConvertedFileName).Length;
                }
                break;

            case "ppt":
                using (PptxDocumentReader reader = new PptxDocumentReader(workItem.ConvertedFileName))
                {
                    workItem.discoveryTimer.Start();
                    DocumentText results = reader.Read();
                    workItem.DiscoveredText = results;
                    workItem.FileSize += new FileInfo(workItem.ConvertedFileName).Length;
                }
                break;

            case "xls":
                using (XlsxDocumentReader reader = new XlsxDocumentReader(workItem.ConvertedFileName))
                {
                    workItem.discoveryTimer.Start();
                    DocumentText results = reader.Read();
                    workItem.DiscoveredText = results;
                    workItem.FileSize += new FileInfo(workItem.ConvertedFileName).Length;
                }
                break;

            default:
                throw new Exception("File extension not expected : " + workItem.Extension);
        }
    }
    catch (Exception)
    {
        workItem.FailureMode = FailureMode.DiscoveryFailed;
        workItem.ShouldAbort = true;

        // Bare rethrow keeps the original stack trace; "throw e;" would reset it to this frame.
        throw;
    }
    finally
    {
        workItem.discoveryTimer.Stop();
    }
}
/// <summary>
/// Checks that the content of <paramref name="tempFile"/> can be read as a PowerPoint (pptx) document.
/// </summary>
/// <param name="tempFile">Temp file whose memory stream is parsed.</param>
/// <returns>
/// <c>true</c> when the document is read without an exception; <c>false</c> otherwise (the failure is logged,
/// not rethrown).
/// </returns>
public override bool VerifyFile(TempFileForActions tempFile)
{
    try
    {
        using (BinaryData bData = new BinaryData(tempFile.GetMemoryStream()))
        using (PptxDocumentReader reader = new PptxDocumentReader(bData))
        {
            // The result is discarded: only whether the read succeeds matters here.
            // The reader is now disposed like everywhere else it is used, instead of being leaked.
            reader.Read();
        }

        return true;
    }
    catch (System.Exception ex)
    {
        Logger.LogError("PowerpointX Verification failed");
        Logger.LogError(ex);
        return false;
    }
}
/// <summary>
/// Original incarnation of the lightspeed clean method. Left for comparison. There is at least one document that
/// gets corrupted by the lightspeed clean user action as opposed to the method below; see Rally DE8912 in the
/// Professional &amp; Deltaview project.
/// </summary>
/// <remarks>
/// The file cleaned is <c>m_sFileForBinClean</c>, with the reader chosen by <c>m_ft</c>. Binary Office formats
/// are cleaned through the reader's <c>Clean</c> call. Office Open XML formats are cleaned into a temporary
/// file, which is then copied over the original and deleted. On successful completion the elapsed time in
/// seconds is stored in <c>m_binCleanTime</c>. Unrecognised file types are left untouched.
/// </remarks>
/// <param name="listContentTypes">Content types to remove from the document.</param>
private void DoLightSpeedClean(List<ContentType> listContentTypes)
{
    System.Diagnostics.Stopwatch watch = new System.Diagnostics.Stopwatch();
    watch.Start();

    switch (m_ft)
    {
        case Workshare.Policy.FileType.WordDocument:
            using (WordDocumentReader Word2003Reader1 = new WordDocumentReader(m_sFileForBinClean, true))
            {
                Word2003Reader1.Clean(listContentTypes);
            }
            break;

        case Workshare.Policy.FileType.ExcelSheet:
            using (ExcelDocumentReader Excel2003Reader1 = new ExcelDocumentReader(m_sFileForBinClean, true))
            {
                Excel2003Reader1.Clean(listContentTypes);
            }
            break;

        case Workshare.Policy.FileType.PowerPoint:
            using (Workshare.FCS.Lite.PptDocumentReader Ppt2003Reader1 = new PptDocumentReader(m_sFileForBinClean, true))
            {
                Ppt2003Reader1.Clean(listContentTypes);
            }
            break;

        case Workshare.Policy.FileType.WordDocumentX:
        case Workshare.Policy.FileType.WordDocumentMacroX:
        case Workshare.Policy.FileType.WordDocumentTemplateX:
        case Workshare.Policy.FileType.WordDocumentMacroTemplateX:
            {
                string outFileName = System.IO.Path.GetTempFileName();
                try
                {
                    using (DocxDocumentReader Word2007Reader1 = new DocxDocumentReader(m_sFileForBinClean))
                    using (Stream outStr = File.Open(outFileName, FileMode.Create))
                    {
                        Word2007Reader1.CleanTo(outStr, listContentTypes);
                    }

                    // Both the output stream and the reader are closed before the original is overwritten.
                    File.Copy(outFileName, m_sFileForBinClean, true);
                }
                finally
                {
                    // GetTempFileName creates the file on disk; remove it even when cleaning fails.
                    File.Delete(outFileName);
                }
                break;
            }

        case Workshare.Policy.FileType.ExcelSheetX:
        case Workshare.Policy.FileType.ExcelSheetMacroX:
        case Workshare.Policy.FileType.ExcelSheetTemplateX:
        case Workshare.Policy.FileType.ExcelSheetMacroTemplateX:
            {
                string outFileName = System.IO.Path.GetTempFileName();
                try
                {
                    using (XlsxDocumentReader Excel2007Reader1 = new XlsxDocumentReader(m_sFileForBinClean))
                    using (Stream outStr = File.Open(outFileName, FileMode.Create))
                    {
                        Excel2007Reader1.CleanTo(outStr, listContentTypes);
                    }

                    // Both the output stream and the reader are closed before the original is overwritten.
                    File.Copy(outFileName, m_sFileForBinClean, true);
                }
                finally
                {
                    // GetTempFileName creates the file on disk; remove it even when cleaning fails.
                    File.Delete(outFileName);
                }
                break;
            }

        case Workshare.Policy.FileType.PowerPointX:
        case Workshare.Policy.FileType.PowerPointMacroX:
        case Workshare.Policy.FileType.PowerPointTemplateX:
        case Workshare.Policy.FileType.PowerPointMacroTemplateX:
        case Workshare.Policy.FileType.PowerPointShowX:
        case Workshare.Policy.FileType.PowerPointMacroShowX:
            {
                string outFileName = System.IO.Path.GetTempFileName();
                try
                {
                    using (PptxDocumentReader Ppt2007Reader1 = new PptxDocumentReader(m_sFileForBinClean))
                    using (Stream outStr = File.Open(outFileName, FileMode.Create))
                    {
                        Ppt2007Reader1.CleanTo(outStr, listContentTypes);
                    }

                    // Both the output stream and the reader are closed before the original is overwritten.
                    File.Copy(outFileName, m_sFileForBinClean, true);
                }
                finally
                {
                    // GetTempFileName creates the file on disk; remove it even when cleaning fails.
                    File.Delete(outFileName);
                }
                break;
            }

        default:
            break;
    }

    watch.Stop();
    m_binCleanTime = watch.Elapsed.TotalSeconds;
}
/// <summary>
/// Reads the text of <paramref name="sFilename"/> using the reader that matches the current file type
/// (<c>m_ft</c>).
/// </summary>
/// <param name="sFilename">Path of the document to read.</param>
/// <returns>The text read from the document, or <c>null</c> when the file type has no matching reader.</returns>
DocumentText DiscoverDocument(string sFilename)
{
    // Stays null unless one of the supported file types below assigns it.
    DocumentText discovered = null;

    switch (m_ft)
    {
        case Workshare.Policy.FileType.WordDocument:
            using (WordDocumentReader binaryWordReader = new WordDocumentReader(sFilename, false))
            {
                discovered = binaryWordReader.Read();
            }
            break;

        case Workshare.Policy.FileType.ExcelSheet:
            using (ExcelDocumentReader binaryExcelReader = new ExcelDocumentReader(sFilename, false))
            {
                discovered = binaryExcelReader.Read();
            }
            break;

        case Workshare.Policy.FileType.PowerPoint:
            using (Workshare.FCS.Lite.PptDocumentReader binaryPptReader = new PptDocumentReader(sFilename, false))
            {
                discovered = binaryPptReader.Read();
            }
            break;

        case Workshare.Policy.FileType.WordDocumentX:
        case Workshare.Policy.FileType.WordDocumentMacroX:
        case Workshare.Policy.FileType.WordDocumentTemplateX:
        case Workshare.Policy.FileType.WordDocumentMacroTemplateX:
            using (DocxDocumentReader docxReader = new DocxDocumentReader(sFilename))
            {
                discovered = docxReader.Read();
            }
            break;

        case Workshare.Policy.FileType.ExcelSheetX:
        case Workshare.Policy.FileType.ExcelSheetMacroX:
        case Workshare.Policy.FileType.ExcelSheetTemplateX:
        case Workshare.Policy.FileType.ExcelSheetMacroTemplateX:
            using (XlsxDocumentReader xlsxReader = new XlsxDocumentReader(sFilename))
            {
                discovered = xlsxReader.Read();
            }
            break;

        case Workshare.Policy.FileType.PowerPointX:
        case Workshare.Policy.FileType.PowerPointMacroX:
        case Workshare.Policy.FileType.PowerPointTemplateX:
        case Workshare.Policy.FileType.PowerPointMacroTemplateX:
        case Workshare.Policy.FileType.PowerPointShowX:
        case Workshare.Policy.FileType.PowerPointMacroShowX:
            using (PptxDocumentReader pptxReader = new PptxDocumentReader(sFilename))
            {
                discovered = pptxReader.Read();
            }
            break;

        default:
            break;
    }

    return discovered;
}
/// <summary>
/// Re-reads the cleaned file of <paramref name="workItem"/> and fails if it still contains populated
/// text types other than the ones allowed for its format.
/// </summary>
/// <param name="workItem">
/// Work item to check. Its <c>Extension</c> must be "doc", "ppt" or "xls"; the file read is
/// <c>CleanedFileName</c>. On a violation, <c>Info</c> and <c>ContentType</c> describe the offending text type.
/// </param>
/// <exception cref="Exception">
/// Thrown when an unexpected populated content type is found, for an unexpected extension, or rethrown from
/// the reader. In every case the work item is marked <c>FailureMode.ThingsNotCleaned</c> first.
/// </exception>
public override void Execute(WorkItem workItem)
{
    try
    {
        switch (workItem.Extension)
        {
            case "doc":
                using (DocxDocumentReader reader = new DocxDocumentReader(workItem.CleanedFileName))
                {
                    DocumentText results = reader.Read();
                    foreach (IAbstractTextType tt in results.GetTextTypes())
                    {
                        if (tt.GetContentType() == ContentType.WorkshareProperty)
                        {
                            continue; // We explicitly never clean these
                        }

                        // Paragraphs may keep children; any other populated type is a failure.
                        if (tt.GetChildCount() > 0 && (tt.GetContentType() != ContentType.Paragraph))
                        {
                            workItem.Info = DumpTextType(tt);
                            workItem.ContentType = tt.GetContentType();
                            throw new Exception("Unexpected content type found in cleaned doc");
                        }
                    }
                }
                break;

            case "ppt":
                using (PptxDocumentReader reader = new PptxDocumentReader(workItem.CleanedFileName))
                {
                    DocumentText results = reader.Read();
                    foreach (IAbstractTextType tt in results.GetTextTypes())
                    {
                        // Text boxes and paragraphs are allowed to remain.
                        if (tt.GetContentType() == ContentType.TextBox)
                        {
                            continue;
                        }

                        if (tt.GetContentType() == ContentType.Paragraph)
                        {
                            continue;
                        }

                        if (tt.GetChildCount() > 0)
                        {
                            workItem.Info = DumpTextType(tt);
                            workItem.ContentType = tt.GetContentType();
                            throw new Exception("Unexpected content type found in cleaned ppt");
                        }
                    }
                }
                break;

            case "xls":
                using (XlsxDocumentReader reader = new XlsxDocumentReader(workItem.CleanedFileName))
                {
                    DocumentText results = reader.Read();
                    foreach (IAbstractTextType tt in results.GetTextTypes())
                    {
                        if (tt.GetContentType() == ContentType.RoutingSlip)
                        {
                            continue;
                        }

                        // Cell text may keep children; any other populated type is a failure.
                        if (tt.GetChildCount() > 0 && (tt.GetContentType() != ContentType.CellText))
                        {
                            workItem.Info = DumpTextType(tt);
                            workItem.ContentType = tt.GetContentType();
                            throw new Exception("Unexpected content type found in cleaned xls");
                        }
                    }
                }
                break;

            default:
                throw new Exception("File extension not expected : " + workItem.Extension);
        }
    }
    catch (Exception)
    {
        workItem.FailureMode = FailureMode.ThingsNotCleaned;

        // Bare rethrow keeps the original stack trace; "throw e;" would reset it to this frame.
        throw;
    }
}
/// <summary>
/// Returns the document text for <c>m_fileData</c>, reading it on first use and caching it in
/// <c>m_docText</c> for later calls.
/// </summary>
/// <remarks>
/// The reader is chosen from <c>m_fileData.FileType</c>. For a file type with no matching reader nothing
/// is read or cached, and the (still null) cached value is returned.
/// </remarks>
/// <returns>A FCS.Lite.DocumentText object, or <c>null</c> when the file type has no matching reader.</returns>
public FCS.Lite.Base.DocumentText AdvancedRead()
{
    // Already read once: hand back the cached result.
    if (m_docText != null)
    {
        return m_docText;
    }

    FCS.Lite.Base.DocumentReader selectedReader;
    switch (m_fileData.FileType)
    {
        case FileType.WordDocument:
            selectedReader = new WordDocumentReader(m_fileData.BinaryFileData);
            break;

        case FileType.PowerPoint:
            selectedReader = new PptDocumentReader(m_fileData.BinaryFileData);
            break;

        case FileType.ExcelSheet:
            selectedReader = new ExcelDocumentReader(m_fileData.BinaryFileData);
            break;

        case FileType.WordDocumentX:
        case FileType.WordDocumentMacroX:
        case FileType.WordDocumentTemplateX:
        case FileType.WordDocumentMacroTemplateX:
            selectedReader = new DocxDocumentReader(m_fileData.BinaryFileData);
            break;

        case FileType.PowerPointX:
        case FileType.PowerPointMacroX:
        case FileType.PowerPointTemplateX:
        case FileType.PowerPointMacroTemplateX:
        case FileType.PowerPointShowX:
        case FileType.PowerPointMacroShowX:
            selectedReader = new PptxDocumentReader(m_fileData.BinaryFileData);
            break;

        case FileType.ExcelSheetX:
        case FileType.ExcelSheetMacroX:
        case FileType.ExcelSheetTemplateX:
        case FileType.ExcelSheetMacroTemplateX:
            selectedReader = new XlsxDocumentReader(m_fileData.BinaryFileData);
            break;

        case FileType.PDFDocument:
            selectedReader = new PdfDocumentReader(m_fileData.BinaryFileData);
            break;

        default:
            // No reader for this file type; m_docText is still null here.
            return m_docText;
    }

    using (selectedReader)
    {
        m_docText = selectedReader.Read();
    }

    return m_docText;
}