/// <summary>
        /// Cleans the document held by <paramref name="tempFile"/> with a <c>DocxDocumentReader</c>.
        /// The elements to clean and the exclusions are both derived from
        /// <paramref name="cleanProperties"/>; the cleaned output is written to the stream returned
        /// by <c>GetOutputStream()</c> and <c>InterimTempFileName</c> is then copied over
        /// <c>tempFile.TempFile</c>.
        /// </summary>
        /// <param name="tempFile">Supplies the input memory stream and the destination file path.</param>
        /// <param name="cleanProperties">Property set used to build the clean list and the exclusion list.</param>
        /// <remarks>
        /// Any exception is logged and rethrown unchanged. <c>CleanUp()</c> runs in every case.
        /// </remarks>
        public override void RemoveMetadata(TempFileForActions tempFile, CleanActionPropertySet cleanProperties)
        {
            try
            {
                m_elementsToClean = GetRationalizedElementsToClean(
                    GetListOfEnabledElementsToClean(cleanProperties),
                    CleanActionPropertySet.TargetApplication.Word);
                List<Exclusion> exclusions = GetListOfExcludedElements(cleanProperties);

                using (BinaryData sourceData = new BinaryData(tempFile.GetMemoryStream()))
                using (DocxDocumentReader docxReader = new DocxDocumentReader(sourceData))
                using (Stream output = GetOutputStream())
                {
                    docxReader.CleanTo(output, m_elementsToClean, exclusions);

                    // NOTE(review): the copy is made while the output stream is still open;
                    // presumably GetOutputStream() writes to InterimTempFileName - confirm.
                    File.Copy(InterimTempFileName, tempFile.TempFile, true);
                }
            }
            catch (System.Exception failure)
            {
                Logger.LogError("WordX cleaning failed");
                Logger.LogError(failure);
                throw;
            }
            finally
            {
                CleanUp();
            }
        }
        /// <summary>
        /// Cleans the document held by <paramref name="tempFile"/> with a <c>DocxDocumentReader</c>,
        /// restricted to the elements returned by <c>GetElementsNotRemovedByDomClean</c>.
        /// The output goes to the stream from <c>GetOutputStream()</c>, after which
        /// <c>InterimTempFileName</c> is copied over <c>tempFile.TempFile</c>.
        /// </summary>
        /// <param name="tempFile">Supplies the input memory stream and the destination file path.</param>
        /// <param name="elementsToClean">Property set used to build the clean list and the exclusion list.</param>
        /// <remarks>
        /// Any exception is logged and rethrown unchanged. <c>CleanUp()</c> runs in every case.
        /// </remarks>
        private void ExecuteUsingOfficeOpenXml(TempFileForActions tempFile, CleanActionPropertySet elementsToClean)
        {
            try
            {
                m_elementsToClean = GetElementsNotRemovedByDomClean(
                    GetListOfEnabledElementsToClean(elementsToClean));

                using (BinaryData sourceData = new BinaryData(tempFile.GetMemoryStream()))
                using (DocxDocumentReader docxReader = new DocxDocumentReader(sourceData))
                using (Stream output = GetOutputStream())
                {
                    // The exclusion list is built only once all three resources are open.
                    List<Exclusion> exclusions = GetListOfExcludedElements(elementsToClean);
                    docxReader.CleanTo(output, m_elementsToClean, exclusions);

                    // NOTE(review): the copy is made while the output stream is still open;
                    // presumably GetOutputStream() writes to InterimTempFileName - confirm.
                    File.Copy(InterimTempFileName, tempFile.TempFile, true);
                }
            }
            catch (System.Exception failure)
            {
                Logger.LogError(failure);
                throw;
            }
            finally
            {
                CleanUp();
            }
        }
Example #3
0
        /// <summary>
        /// Reads the converted file of <paramref name="workItem"/> with the reader selected by
        /// <c>workItem.Extension</c> ("doc", "ppt" or "xls"), stores the result in
        /// <c>workItem.DiscoveredText</c> and adds the converted file's length to
        /// <c>workItem.FileSize</c>.
        /// </summary>
        /// <param name="workItem">The work item to run discovery on; updated in place.</param>
        /// <exception cref="Exception">
        /// Thrown for an unexpected extension, or rethrown from the reader. Before the exception
        /// leaves this method the work item is marked <c>FailureMode.DiscoveryFailed</c> and
        /// <c>ShouldAbort</c> is set.
        /// </exception>
        public override void Execute(WorkItem workItem)
        {
            try
            {
                switch (workItem.Extension)
                {
                    case "doc":
                        using (DocxDocumentReader reader = new DocxDocumentReader(workItem.ConvertedFileName))
                        {
                            // The timer is started only after the reader has been constructed.
                            workItem.discoveryTimer.Start();
                            DocumentText results = reader.Read();
                            workItem.DiscoveredText = results;
                            workItem.FileSize += new FileInfo(workItem.ConvertedFileName).Length;
                        }
                        break;
                    case "ppt":
                        using (PptxDocumentReader reader = new PptxDocumentReader(workItem.ConvertedFileName))
                        {
                            workItem.discoveryTimer.Start();
                            DocumentText results = reader.Read();
                            workItem.DiscoveredText = results;
                            workItem.FileSize += new FileInfo(workItem.ConvertedFileName).Length;
                        }
                        break;
                    case "xls":
                        using (XlsxDocumentReader reader = new XlsxDocumentReader(workItem.ConvertedFileName))
                        {
                            workItem.discoveryTimer.Start();
                            DocumentText results = reader.Read();
                            workItem.DiscoveredText = results;
                            workItem.FileSize += new FileInfo(workItem.ConvertedFileName).Length;
                        }
                        break;
                    default:
                        throw new Exception("File extension not expected : " + workItem.Extension);
                }
            }
            catch (Exception)
            {
                workItem.FailureMode = FailureMode.DiscoveryFailed;
                workItem.ShouldAbort = true;

                // Rethrow with "throw;" so the original stack trace is preserved
                // ("throw e;" would reset it to this line).
                throw;
            }
            finally
            {
                workItem.discoveryTimer.Stop();
            }
        }
        /// <summary>
        /// Checks that <paramref name="tempFile"/> can be read by a <c>DocxDocumentReader</c>.
        /// </summary>
        /// <param name="tempFile">Supplies the memory stream to read.</param>
        /// <returns>
        /// <c>true</c> when the read completes without throwing; <c>false</c> when any exception
        /// is raised (the exception is logged and not propagated).
        /// </returns>
        public override bool VerifyFile(TempFileForActions tempFile)
        {
            try
            {
                using (BinaryData content = new BinaryData(tempFile.GetMemoryStream()))
                using (DocxDocumentReader docxReader = new DocxDocumentReader(content))
                {
                    // The result is discarded; only whether Read() throws matters here.
                    docxReader.Read();
                }
            }
            catch (System.Exception failure)
            {
                Logger.LogError("WordX Verification failed");
                Logger.LogError(failure);
                return false;
            }

            return true;
        }
		/// <summary>
		/// Original incarnation of lightspeed clean method. Left for comparison. There is at least one document that gets corrupted by the
		/// lightspeed clean user action as opposed to the method below. See Rally DE8912 in Professional and Deltaview project.
		/// </summary>
		/// <remarks>
		/// Selects a reader from <c>m_ft</c> and cleans <c>m_sFileForBinClean</c>. The binary
		/// (2003) readers call <c>Clean</c> directly on that file; the OOXML readers write the
		/// cleaned output to a temporary file which is then copied back over
		/// <c>m_sFileForBinClean</c>. Unrecognised file types are left untouched. The elapsed
		/// time in seconds is stored in <c>m_binCleanTime</c> when the method completes normally.
		/// </remarks>
		/// <param name="listContentTypes">The content types passed to the reader's <c>Clean</c> / <c>CleanTo</c> call.</param>
		private void DoLightSpeedClean(List<ContentType> listContentTypes)
		{
			System.Diagnostics.Stopwatch watch = new System.Diagnostics.Stopwatch();
			watch.Start();
			switch (m_ft)
			{
				case Workshare.Policy.FileType.WordDocument:
					using (WordDocumentReader Word2003Reader1 = new WordDocumentReader(m_sFileForBinClean, true))
					{
						Word2003Reader1.Clean(listContentTypes);
					}
					break;

				case Workshare.Policy.FileType.ExcelSheet:
					using (ExcelDocumentReader Excel2003Reader1 = new ExcelDocumentReader(m_sFileForBinClean, true))
					{
						Excel2003Reader1.Clean(listContentTypes);
					}
					break;

				case Workshare.Policy.FileType.PowerPoint:
					using (Workshare.FCS.Lite.PptDocumentReader Ppt2003Reader1 = new PptDocumentReader(m_sFileForBinClean, true))
					{
						Ppt2003Reader1.Clean(listContentTypes);
					}
					break;


				case Workshare.Policy.FileType.WordDocumentX:
				case Workshare.Policy.FileType.WordDocumentMacroX:
				case Workshare.Policy.FileType.WordDocumentTemplateX:
				case Workshare.Policy.FileType.WordDocumentMacroTemplateX:
					{
						string outFileName = System.IO.Path.GetTempFileName();
						try
						{
							using (DocxDocumentReader Word2007Reader1 = new DocxDocumentReader(m_sFileForBinClean))
							{
								using (Stream outStr = File.Open(outFileName, FileMode.Create))
								{
									Word2007Reader1.CleanTo(outStr, listContentTypes);
								}
							}
							// Both the reader and the output stream are closed before copying back.
							File.Copy(outFileName, m_sFileForBinClean, true);
						}
						finally
						{
							// Remove the temporary file even when cleaning or copying fails.
							File.Delete(outFileName);
						}
						break;
					}

				case Workshare.Policy.FileType.ExcelSheetX:
				case Workshare.Policy.FileType.ExcelSheetMacroX:
				case Workshare.Policy.FileType.ExcelSheetTemplateX:
				case Workshare.Policy.FileType.ExcelSheetMacroTemplateX:
					{
						string outFileName = System.IO.Path.GetTempFileName();
						try
						{
							using (XlsxDocumentReader Excel2007Reader1 = new XlsxDocumentReader(m_sFileForBinClean))
							{
								using (Stream outStr = File.Open(outFileName, FileMode.Create))
								{
									Excel2007Reader1.CleanTo(outStr, listContentTypes);
								}
							}
							File.Copy(outFileName, m_sFileForBinClean, true);
						}
						finally
						{
							// Remove the temporary file even when cleaning or copying fails.
							File.Delete(outFileName);
						}
						break;
					}

				case Workshare.Policy.FileType.PowerPointX:
				case Workshare.Policy.FileType.PowerPointMacroX:
				case Workshare.Policy.FileType.PowerPointTemplateX:
				case Workshare.Policy.FileType.PowerPointMacroTemplateX:
				case Workshare.Policy.FileType.PowerPointShowX:
				case Workshare.Policy.FileType.PowerPointMacroShowX:
					{
						string outFileName = System.IO.Path.GetTempFileName();
						try
						{
							using (PptxDocumentReader Ppt2007Reader1 = new PptxDocumentReader(m_sFileForBinClean))
							{
								using (Stream outStr = File.Open(outFileName, FileMode.Create))
								{
									Ppt2007Reader1.CleanTo(outStr, listContentTypes);
								}
							}
							File.Copy(outFileName, m_sFileForBinClean, true);
						}
						finally
						{
							// Remove the temporary file even when cleaning or copying fails.
							File.Delete(outFileName);
						}
						break;
					}

				default:
					break;
			}
			watch.Stop();
			m_binCleanTime = watch.Elapsed.TotalSeconds;
		}
		/// <summary>
		/// Reads <paramref name="sFilename"/> with the document reader that matches <c>m_ft</c>.
		/// </summary>
		/// <param name="sFilename">Path of the file handed to the reader's constructor.</param>
		/// <returns>
		/// The result of the reader's <c>Read()</c> call, or <c>null</c> when <c>m_ft</c> is not
		/// one of the handled file types.
		/// </returns>
		DocumentText DiscoverDocument(string sFilename)
		{
			DocumentText discovered = null;

			switch (m_ft)
			{
				// Binary (2003) formats: these readers are constructed with a second argument of false.
				case Workshare.Policy.FileType.WordDocument:
					using (WordDocumentReader binaryWordReader = new WordDocumentReader(sFilename, false))
					{
						discovered = binaryWordReader.Read();
					}
					break;

				case Workshare.Policy.FileType.ExcelSheet:
					using (ExcelDocumentReader binaryExcelReader = new ExcelDocumentReader(sFilename, false))
					{
						discovered = binaryExcelReader.Read();
					}
					break;

				case Workshare.Policy.FileType.PowerPoint:
					using (Workshare.FCS.Lite.PptDocumentReader binaryPptReader = new PptDocumentReader(sFilename, false))
					{
						discovered = binaryPptReader.Read();
					}
					break;

				// OOXML (2007) formats.
				case Workshare.Policy.FileType.WordDocumentX:
				case Workshare.Policy.FileType.WordDocumentMacroX:
				case Workshare.Policy.FileType.WordDocumentTemplateX:
				case Workshare.Policy.FileType.WordDocumentMacroTemplateX:
					using (DocxDocumentReader docxReader = new DocxDocumentReader(sFilename))
					{
						discovered = docxReader.Read();
					}
					break;

				case Workshare.Policy.FileType.ExcelSheetX:
				case Workshare.Policy.FileType.ExcelSheetMacroX:
				case Workshare.Policy.FileType.ExcelSheetTemplateX:
				case Workshare.Policy.FileType.ExcelSheetMacroTemplateX:
					using (XlsxDocumentReader xlsxReader = new XlsxDocumentReader(sFilename))
					{
						discovered = xlsxReader.Read();
					}
					break;

				case Workshare.Policy.FileType.PowerPointX:
				case Workshare.Policy.FileType.PowerPointMacroX:
				case Workshare.Policy.FileType.PowerPointTemplateX:
				case Workshare.Policy.FileType.PowerPointMacroTemplateX:
				case Workshare.Policy.FileType.PowerPointShowX:
				case Workshare.Policy.FileType.PowerPointMacroShowX:
					using (PptxDocumentReader pptxReader = new PptxDocumentReader(sFilename))
					{
						discovered = pptxReader.Read();
					}
					break;

				default:
					// Unhandled file type: fall through with the null result.
					break;
			}

			return discovered;
		}
        /// <summary>
        /// Reads the cleaned file of <paramref name="workItem"/> with the reader selected by
        /// <c>workItem.Extension</c> ("doc", "ppt" or "xls") and checks the text types it
        /// reports. A text type that still has children and is not on the per-format allow list
        /// is treated as content that was not cleaned.
        /// </summary>
        /// <param name="workItem">
        /// The work item to verify. On an unexpected text type its <c>Info</c> and
        /// <c>ContentType</c> are set to describe the offending item.
        /// </param>
        /// <exception cref="Exception">
        /// Thrown for an unexpected extension or an unexpected content type, or rethrown from the
        /// reader. In every case <c>workItem.FailureMode</c> is set to
        /// <c>FailureMode.ThingsNotCleaned</c> before the exception leaves this method.
        /// </exception>
        public override void Execute(WorkItem workItem)
        {
            try
            {
                switch (workItem.Extension)
                {
                    case "doc":
                        using (DocxDocumentReader reader = new DocxDocumentReader(workItem.CleanedFileName))
                        {
                            DocumentText results = reader.Read();
                            foreach (IAbstractTextType tt in results.GetTextTypes())
                            {
                                if (tt.GetContentType() == ContentType.WorkshareProperty)
                                    continue; // We explicitly never clean these

                                // Paragraphs are allowed to keep children; anything else is a failure.
                                if (tt.GetChildCount() > 0 && (tt.GetContentType() != ContentType.Paragraph))
                                {
                                    workItem.Info = DumpTextType(tt);
                                    workItem.ContentType = tt.GetContentType();
                                    throw new Exception("Unexpected content type found in cleaned doc");
                                }
                            }
                        }
                        break;
                    case "ppt":
                        using (PptxDocumentReader reader = new PptxDocumentReader(workItem.CleanedFileName))
                        {
                            DocumentText results = reader.Read();
                            foreach (IAbstractTextType tt in results.GetTextTypes())
                            {
                                // Text boxes and paragraphs are skipped regardless of child count.
                                if (tt.GetContentType() == ContentType.TextBox)
                                    continue;
                                if (tt.GetContentType() == ContentType.Paragraph)
                                    continue;

                                if (tt.GetChildCount() > 0)
                                {
                                    workItem.Info = DumpTextType(tt);
                                    workItem.ContentType = tt.GetContentType();
                                    throw new Exception("Unexpected content type found in cleaned ppt");
                                }
                            }
                        }
                        break;

                    case "xls":
                        using (XlsxDocumentReader reader = new XlsxDocumentReader(workItem.CleanedFileName))
                        {
                            DocumentText results = reader.Read();
                            foreach (IAbstractTextType tt in results.GetTextTypes())
                            {
                                if (tt.GetContentType() == ContentType.RoutingSlip)
                                    continue;

                                // Cell text is allowed to keep children; anything else is a failure.
                                if (tt.GetChildCount() > 0 && (tt.GetContentType() != ContentType.CellText))
                                {
                                    workItem.Info = DumpTextType(tt);
                                    workItem.ContentType = tt.GetContentType();
                                    throw new Exception("Unexpected content type found in cleaned xls");
                                }
                            }
                        }
                        break;
                    default:
                        throw new Exception("File extension not expected : " + workItem.Extension);
                }
            }
            catch (Exception)
            {
                workItem.FailureMode = FailureMode.ThingsNotCleaned;

                // Rethrow with "throw;" so the original stack trace is preserved
                // ("throw e;" would reset it to this line).
                throw;
            }
        }
Example #8
0
        /// <summary>
        /// Returns the document text for <c>m_fileData</c>. The first successful call picks a
        /// reader from <c>m_fileData.FileType</c>, reads <c>m_fileData.BinaryFileData</c> and
        /// caches the result in <c>m_docText</c>; later calls return the cached value.
        /// </summary>
        /// <returns>
        /// A FCS.Lite.DocumentText object, or the still-unset <c>m_docText</c> when the file type
        /// has no matching reader.
        /// </returns>
        public FCS.Lite.Base.DocumentText AdvancedRead()
        {
            // Already read once: hand back the cached result.
            if (m_docText != null)
            {
                return m_docText;
            }

            FCS.Lite.Base.DocumentReader selectedReader;

            switch (m_fileData.FileType)
            {
                case FileType.WordDocument:
                    selectedReader = new WordDocumentReader(m_fileData.BinaryFileData);
                    break;

                case FileType.PowerPoint:
                    selectedReader = new PptDocumentReader(m_fileData.BinaryFileData);
                    break;

                case FileType.ExcelSheet:
                    selectedReader = new ExcelDocumentReader(m_fileData.BinaryFileData);
                    break;

                case FileType.WordDocumentX:
                case FileType.WordDocumentMacroX:
                case FileType.WordDocumentTemplateX:
                case FileType.WordDocumentMacroTemplateX:
                    selectedReader = new DocxDocumentReader(m_fileData.BinaryFileData);
                    break;

                case FileType.PowerPointX:
                case FileType.PowerPointMacroX:
                case FileType.PowerPointTemplateX:
                case FileType.PowerPointMacroTemplateX:
                case FileType.PowerPointShowX:
                case FileType.PowerPointMacroShowX:
                    selectedReader = new PptxDocumentReader(m_fileData.BinaryFileData);
                    break;

                case FileType.ExcelSheetX:
                case FileType.ExcelSheetMacroX:
                case FileType.ExcelSheetTemplateX:
                case FileType.ExcelSheetMacroTemplateX:
                    selectedReader = new XlsxDocumentReader(m_fileData.BinaryFileData);
                    break;

                case FileType.PDFDocument:
                    selectedReader = new PdfDocumentReader(m_fileData.BinaryFileData);
                    break;

                default:
                    // No reader for this file type: nothing is cached.
                    return m_docText;
            }

            using (selectedReader)
            {
                m_docText = selectedReader.Read();
            }

            return m_docText;
        }