// Runs an XmlPartFilter configured with the hidden-document-text and smart-tag
// triggers over document1.xml (DiscoverAndClean) and checks that an output stream
// is produced and that one text type of each kind is reported in the discovery text.
public void TestXMLPartFilter()
{
    XmlPartFilter xpf = new XmlPartFilter(new CommonNamespaces(OpenXmlFormat.Transitional));
    xpf.Triggers.AddRange(DocxMetadataDefinitions.HiddenDocumentText);
    xpf.Triggers.AddRange(DocxMetadataDefinitions.SmartTags);

    using (Stream sIn = File.Open(TESTFILE_DIR + "document1.xml", FileMode.Open))
    {
        DocumentText docText = new DocumentText();
        Stream sOut = xpf.ProcessPart(sIn, docText, null, DocumentProcessingActions.DiscoverAndClean);

        Assert.IsNotNull(sOut, "output stream null unexpectedly");
        Assert.AreEqual(1, docText.GetTextTypes(ContentType.HiddenText).Count, "expected a hidden text type");

        // Smart tags are verified through their own content type; checking HiddenText
        // a second time would leave the smart-tag trigger completely untested.
        Assert.AreEqual(1, docText.GetTextTypes(ContentType.SmartTag).Count, "expected a smart tags type");
    }
}
// Runs an XmlPartFilter configured with the custom-property and workshare-property
// triggers over xlsx_customonly.xml and expects exactly one CustomProperty text type
// and no WorkshareProperty text type in the discovery result.
public void TestDiscoverCustomPropertiesXlsx()
{
    XmlPartFilter filter = new XmlPartFilter(new CommonNamespaces());
    filter.Triggers.AddRange(GenericMetadataDefinitions.CustomProperties);
    filter.Triggers.AddRange(GenericMetadataDefinitions.WorkshareProperties);

    using (Stream input = File.Open(TESTFILE_DIR + "xlsx_customonly.xml", FileMode.Open))
    {
        DocumentText discovered = new DocumentText();
        Stream output = filter.ProcessPart(input, discovered, null, DocumentProcessingActions.DiscoverAndClean);
        Assert.IsNotNull(output, "output stream null unexpectedly");

        int workshareTypeCount = discovered.GetTextTypes(ContentType.WorkshareProperty).Count;
        Assert.AreEqual(0, workshareTypeCount, "expected no workshareproperty text type");

        int customTypeCount = discovered.GetTextTypes(ContentType.CustomProperty).Count;
        Assert.AreEqual(1, customTypeCount, "expected a customproperty type");
    }
}
// Processes one part for discovery.
//
// Returns null when the action includes cleaning, and hands the input stream back
// untouched for PassThrough. Otherwise, when this filter is interested in macros, a
// node carrying the Id / Target / Type values held by this instance is appended to the
// Macro text type of discoveryText (the text type is created and registered first if
// none exists yet). Finally the filter is initialised, constructed and executed, and
// its output stream is returned.
//
// Author's note carried over: this is for discovery on open documents - the copy loses
// all other macro information except these items specifying macro content. There is a
// single vbaproject.bin file containing all macros; so far this appears to be valid.
public override Stream ProcessPart(Stream partData, DocumentText discoveryText, RelatedPartProvider relPartProvider, DocumentProcessingActions action)
{
    if (DocumentProcessor.ActionIncludesCleaning(action))
    {
        return null;
    }

    if (action == DocumentProcessingActions.PassThrough)
    {
        return partData;
    }

    if (m_bInterestedInMacros)
    {
        // Reuse the first Macro text type when one is already registered.
        List<IAbstractTextType> macroTypes = discoveryText.GetTextTypes(ContentType.Macro);
        TextType macroType;
        if (macroTypes.Count == 0)
        {
            macroType = new TextType(ContentType.Macro);
            discoveryText.AddTextType(macroType);
        }
        else
        {
            macroType = (TextType)macroTypes[0];
        }

        TextNode node = new TextNode();

        NodeInfo idInfo = new NodeInfo();
        idInfo.name = "Id";
        idInfo.value = m_id;
        idInfo.type = DataType.String;
        node.AddInfo(idInfo);

        NodeInfo targetInfo = new NodeInfo();
        targetInfo.name = "Target";
        targetInfo.value = m_target;
        targetInfo.type = DataType.String;
        node.AddInfo(targetInfo);

        NodeInfo typeInfo = new NodeInfo();
        typeInfo.name = "Type";
        typeInfo.value = m_type;
        typeInfo.type = DataType.String;
        node.AddInfo(typeInfo);

        macroType.AddChild(node);
    }

    Initialize();
    ConstructFilter(partData, discoveryText, action);
    ExecuteFilter();
    return m_outStream;
}
// Walks every text type of the reference discovery (dtCheck) and, for the types that
// matter, verifies via CheckTypeWasFound that discoveredText contains the same type.
// Text types that are all white space are skipped.
private void CompareDiscoveryForDoc(string filename, DocumentText discoveredText, DocumentText dtCheck)
{
    foreach (IAbstractTextType expected in dtCheck.GetTextTypes())
    {
        if (IsAllWhiteSpace(expected))
        {
            continue;
        }

        bool mustBeDiscovered;
        switch (expected.GetContentType())
        {
            case ContentType.Version:
            case ContentType.RoutingSlip:
            case ContentType.AutoVersion:
                mustBeDiscovered = false;
                break;

            case ContentType.Reviewer:
                // Deliberately not checked: a doc that *had* track changes has this
                // type, a docx that *had* track changes doesn't. (A disabled variant
                // only checked it when dtCheck held no Version text type.)
                mustBeDiscovered = false;
                break;

            case ContentType.Macro:
                mustBeDiscovered = !HadJustThisDocumentMacro(expected);
                break;

            case ContentType.TextBox:
                mustBeDiscovered = !AreAllTextBoxesPictures(expected);
                break;

            default:
                mustBeDiscovered = true;
                break;
        }

        if (mustBeDiscovered)
        {
            CheckTypeWasFound(filename, discoveredText, expected);
        }
    }
}
// Returns the first text type in dt whose content type equals this object's
// ContentType. When none exists a new TextType is created and returned; it is also
// registered with dt unless the content type is ContentRule.
private TextType FindTextType(DocumentText dt)
{
    foreach (TextType candidate in dt.GetTextTypes())
    {
        if (candidate.GetContentType() == ContentType)
        {
            return candidate;
        }
    }

    TextType created = new TextType(ContentType);
    if (ContentType != ContentType.ContentRule)
    {
        dt.AddTextType(created);
    }
    return created;
}
// Verifies the state of one metadata type in the discovery text.
//
//   notExpected - the content type that should have been cleaned
//   dt          - discovery text to inspect
//   type        - the content type being checked by this call
//   sText       - human-readable name of the type, used in failure messages
//
// When type is the cleaned type, either no text type is listed or exactly one with no
// children. For every other type the text type must exist and have at least one child.
private static void CheckOneTypeOfMetadata(ContentType notExpected, DocumentText dt, ContentType type, string sText)
{
    // Word can mark redacted text with "vanish" which may or may not be appropriate.
    if (notExpected == ContentType.HiddenText && type == ContentType.RedactedText)
        return;

    List<IAbstractTextType> listType = dt.GetTextTypes(type);

    if (type == notExpected)
    {
        Assert.IsNotNull(listType, "Failed to clean " + sText);
        if (listType.Count > 0)
        {
            Assert.AreEqual(1, listType.Count, "Will take an empty TextType in the list as OK");
            TextType tt = listType[0] as TextType;
            Assert.IsNotNull(tt, "Failed to clean " + sText);
            Assert.AreEqual(0, tt.GetChildCount(), "Failed to clean " + sText);
        }
    }
    else
    {
        // Check the list and the cast before using them so that a missing type fails
        // with the intended message rather than an index or null-reference exception.
        Assert.IsNotNull(listType, "Test document has no " + sText);
        Assert.Greater(listType.Count, 0, "Test document has no " + sText);
        TextType tt = listType[0] as TextType;
        Assert.IsNotNull(tt, "Test document has no " + sText);
        Assert.Greater(tt.GetChildCount(), 0, "Test document has no " + sText);
    }
}
// Asserts that dt holds discovered text types at all, then runs
// CheckOneTypeOfMetadata for each metadata type in the table below, in order.
//
// Types deliberately left out (their checks are disabled in this helper): Macro,
// Variable, DocumentStatistic, Reviewer, Version, AutoVersion, RoutingSlip.
private static void TestHasAllMetadatTypesOtherThan(DocumentText dt, ContentType notExpected)
{
    Assert.IsNotNull(dt, "expected the document text object to be valid");
    Assert.Greater(dt.GetTextTypes().Count, 0, "expected some document text types to have been added");

    // checkedTypes[i] is reported under the name labels[i]; keep both arrays aligned.
    ContentType[] checkedTypes =
    {
        ContentType.Comment,
        ContentType.TrackChange,
        ContentType.HiddenText,
        ContentType.SmallText,
        ContentType.WhiteText,
        ContentType.AttachedTemplate,
        ContentType.SmartTag,
        ContentType.Field,
        ContentType.Hyperlink,
        ContentType.CustomProperty,
        ContentType.RedactedText,
        ContentType.BuiltInProperty
    };
    string[] labels =
    {
        "comments",
        "track changes",
        "hidden text",
        "small text",
        "white text",
        "attached template",
        "smart tags",
        "fields",
        "hyperlinks",
        "custom properties",
        "redacted text",
        "built-in properties"
    };

    for (int i = 0; i < checkedTypes.Length; ++i)
    {
        CheckOneTypeOfMetadata(notExpected, dt, checkedTypes[i], labels[i]);
    }
}
// Verifies the state of one metadata type in the discovery text.
//
//   notExpected - the content type that should have been cleaned
//   dt          - discovery text to inspect
//   type        - the content type being checked by this call
//   sText       - human-readable name of the type, used in failure messages
//
// When type is the cleaned type, either no text type is listed or exactly one with no
// children. For every other type the text type must exist and have at least one child.
private static void CheckOneTypeOfMetadata(ContentType notExpected, DocumentText dt, ContentType type, string sText)
{
    List<IAbstractTextType> listType = dt.GetTextTypes(type);

    if (type == notExpected)
    {
        Assert.IsNotNull(listType, "Failed to clean " + sText);
        if (listType.Count > 0)
        {
            Assert.AreEqual(1, listType.Count, "Will take an empty TextType in the list as OK");
            TextType tt = listType[0] as TextType;
            Assert.IsNotNull(tt, "Failed to clean " + sText);
            Assert.AreEqual(0, tt.GetChildCount(), "Failed to clean " + sText);
        }
    }
    else
    {
        // Check the list and the cast before using them so that a missing type fails
        // with the intended message rather than an index or null-reference exception.
        Assert.IsNotNull(listType, "Test document has no " + sText);
        Assert.Greater(listType.Count, 0, "Test document has no " + sText);
        TextType tt = listType[0] as TextType;
        Assert.IsNotNull(tt, "Test document has no " + sText);
        Assert.Greater(tt.GetChildCount(), 0, "Test document has no " + sText);
    }
}
// Asserts that dt holds discovered text types at all, then runs
// CheckOneTypeOfMetadata for each metadata type in the table below, in order.
//
// Types deliberately left out (their checks are disabled in this helper): SmartTag,
// CustomProperty, HiddenText, BuiltInProperty.
private static void TestHasAllMetadataTypesOtherThan(DocumentText dt, ContentType notExpected)
{
    Assert.IsNotNull(dt, "expected the document text object to be valid");
    Assert.Greater(dt.GetTextTypes().Count, 0, "expected some document text types to have been added");

    // checkedTypes[i] is reported under the name labels[i]; keep both arrays aligned.
    ContentType[] checkedTypes =
    {
        ContentType.Header,
        ContentType.Footer,
        ContentType.Comment,
        ContentType.TrackChange,
        ContentType.SmallText,
        ContentType.WhiteText,
        ContentType.Hyperlink,
        ContentType.Links,
        ContentType.HiddenSheet,
        ContentType.HiddenRow,
        ContentType.HiddenColumn,
        ContentType.Macro,
        ContentType.RedactedText
    };
    string[] labels =
    {
        "headers",
        "footers",
        "comments",
        "track changes",
        "small text",
        "white text",
        "hyperlinks",
        "links",
        "hidden sheet",
        "hidden row",
        "hidden column",
        "macro",
        "redacted text"
    };

    for (int i = 0; i < checkedTypes.Length; ++i)
    {
        CheckOneTypeOfMetadata(notExpected, dt, checkedTypes[i], labels[i]);
    }
}
// Asserts that dt returns a non-null list holding exactly one text type for the
// given content type.
// NOTE(review): the failure message speaks of "empty lists" while the assertion
// expects exactly one entry - confirm which of the two reflects the intent.
private static void CheckThatBuiltInTypeOfMetadataWasCleaned(DocumentText dt, ContentType type)
{
    const string failureMessage = "Clean documents should have empty lists";

    List<IAbstractTextType> builtInTypes = dt.GetTextTypes(type);
    Assert.IsNotNull(builtInTypes, failureMessage);
    Assert.AreEqual(1, builtInTypes.Count, failureMessage);
}
// Asserts that dt holds discovered text types at all, then checks every metadata type
// this test document is expected to contain: the type named by notExpected must have
// been cleaned, every other type must still have at least one child.
//
//   dt          - discovery text to inspect
//   notExpected - the content type that should have been cleaned
private static void TestHasAllMetadatTypesOtherThan(DocumentText dt, ContentType notExpected)
{
    Assert.IsNotNull(dt, "expected the document text object to be valid");
    Assert.Greater(dt.GetTextTypes().Count, 0, "expected some document text types to have been added");

    // Headers and footers may keep an (empty) text type after cleaning.
    if (ContentType.Header == notExpected)
        AssertHeaderFooterWasCleaned(dt, ContentType.Header);
    else
        AssertTypeHasChildren(dt, ContentType.Header, "Test document has no headers");

    if (ContentType.Footer == notExpected)
        AssertHeaderFooterWasCleaned(dt, ContentType.Footer);
    else
        AssertTypeHasChildren(dt, ContentType.Footer, "Test document has no footers");

    if (ContentType.Comment == notExpected)
        CheckThatOneTypeOfMetadataWasCleaned(dt, ContentType.Comment);
    else
        AssertTypeHasChildren(dt, ContentType.Comment, "Test document has no comments");

    if (ContentType.Field == notExpected)
        CheckThatOneTypeOfMetadataWasCleaned(dt, ContentType.Field);
    else
        AssertTypeHasChildren(dt, ContentType.Field, "Test document has no 'fields'");

    if (ContentType.HiddenSlide == notExpected)
        CheckThatOneTypeOfMetadataWasCleaned(dt, ContentType.HiddenSlide);
    else
        AssertTypeHasChildren(dt, ContentType.HiddenSlide, "Test document has no hidden slides");

    if (ContentType.SpeakerNote == notExpected)
        CheckThatOneTypeOfMetadataWasCleaned(dt, ContentType.SpeakerNote);
    else
        AssertTypeHasChildren(dt, ContentType.SpeakerNote, "Test document has no speaker notes");

    if (ContentType.CustomProperty == notExpected)
        CheckThatOneTypeOfMetadataWasCleaned(dt, ContentType.CustomProperty);
    else
        AssertTypeHasChildren(dt, ContentType.CustomProperty, "Test document has no custom properties");

    if (ContentType.Macro == notExpected)
        CheckThatOneTypeOfMetadataWasCleaned(dt, ContentType.Macro);
    else
        AssertTypeHasChildren(dt, ContentType.Macro, "Test document has no macros");

    // Built-in properties have their own "cleaned" check.
    if (ContentType.BuiltInProperty == notExpected)
        CheckThatBuiltInTypeOfMetadataWasCleaned(dt, ContentType.BuiltInProperty);
    else
        AssertTypeHasChildren(dt, ContentType.BuiltInProperty, "Test document has no built in properties");
}

// Asserts that a cleaned header/footer type has either no text type listed or exactly
// one text type with no children.
private static void AssertHeaderFooterWasCleaned(DocumentText dt, ContentType type)
{
    List<IAbstractTextType> list = dt.GetTextTypes(type);
    Assert.IsNotNull(list, "HdrFtrs can have lists but no children in those lists");
    if (list.Count > 0)
    {
        Assert.AreEqual(1, list.Count, "HdrFtrs can have lists but no children in those lists");
        TextType tt = list[0] as TextType;
        Assert.IsNotNull(tt);
        Assert.AreEqual(0, tt.GetChildCount(), "Clean documents should have nothing discovered");
    }
}

// Asserts that the first text type of the given content type exists and has at least
// one child. The list and the cast are checked first so that a missing type fails with
// the supplied message rather than an index or null-reference exception.
private static void AssertTypeHasChildren(DocumentText dt, ContentType type, string message)
{
    List<IAbstractTextType> list = dt.GetTextTypes(type);
    Assert.IsNotNull(list, message);
    Assert.Greater(list.Count, 0, message);
    TextType tt = list[0] as TextType;
    Assert.IsNotNull(tt, message);
    Assert.Greater(tt.GetChildCount(), 0, message);
}
// Builds one DiscoveryDifference per content type (Paragraph excluded) holding the
// child counts found for that type in this object's own discovery (m_dtMe), in dtAPI
// and in dtBin. A count is 0 when the discovery has no text type of that content type.
// The text type found in dtBin (or null) is passed to IsExpectedFromRedaction to
// compute the "expected" flag.
public List<DiscoveryDifference> DifferenceTo(DocumentText dtAPI, DocumentText dtBin)
{
    List<DiscoveryDifference> differences = new List<DiscoveryDifference>();

    foreach (ContentType contentType in SingleFileCleaningTester.GetListOfAllContentTypes())
    {
        if (contentType == ContentType.Paragraph)
        {
            continue;
        }

        IAbstractTextType originalMatch = FirstTextTypeOfContentType(m_dtMe, contentType);
        int originalCount = originalMatch != null ? originalMatch.GetChildCount() : 0;

        // Hopefully safe to assume we did not find more in the cleaned document
        IAbstractTextType apiMatch = FirstTextTypeOfContentType(dtAPI, contentType);
        int apiCount = apiMatch != null ? apiMatch.GetChildCount() : 0;

        IAbstractTextType binMatch = FirstTextTypeOfContentType(dtBin, contentType);
        int binCount = binMatch != null ? binMatch.GetChildCount() : 0;

        bool expectedFromRedaction = IsExpectedFromRedaction(binMatch);
        differences.Add(new DiscoveryDifference(contentType, originalCount, apiCount, binCount, expectedFromRedaction));
    }

    return differences;
}

// Returns the first text type in dt with the given content type, or null if none.
private static IAbstractTextType FirstTextTypeOfContentType(DocumentText dt, ContentType contentType)
{
    foreach (IAbstractTextType candidate in dt.GetTextTypes())
    {
        if (candidate.GetContentType() == contentType)
        {
            return candidate;
        }
    }
    return null;
}
// Compares the custom properties of the reference discovery (dtCheck) with those in
// discoveredText.
//
// Equal child counts pass immediately. Otherwise both sides are recounted while
// ignoring properties whose "Name" starts with an underscore, and an Exception is
// thrown when the recounted totals still differ. A missing or empty CustomProperty
// list on either side is treated as zero properties.
//
//   fileName       - not used by this check
//   discoveredText - discovery result under test
//   dtCheck        - reference discovery to compare against
private void CheckCustomPropertiesWereFound(string fileName, DocumentText discoveredText, DocumentText dtCheck)
{
    // Guard the [0] access: an empty list would otherwise throw before the null
    // check below ever gets a chance to treat "no custom properties" as zero.
    List<IAbstractTextType> checkList = dtCheck.GetTextTypes(ContentType.CustomProperty);
    IAbstractTextType ttCheck = null;
    if (checkList != null && checkList.Count > 0)
        ttCheck = checkList[0];
    int iCheck = (ttCheck != null) ? ttCheck.GetChildCount() : 0;

    List<IAbstractTextType> discoveredList = discoveredText.GetTextTypes(ContentType.CustomProperty);
    TextType ttDiscover = null;
    if (discoveredList != null && discoveredList.Count > 0)
        ttDiscover = discoveredList[0] as TextType;
    int iDisc = (ttDiscover != null) ? ttDiscover.GetChildCount() : 0;

    if (iCheck == iDisc)
        return;

    // Raw counts differ: recount, skipping underscore-prefixed property names.
    int iRecountCheck = 0;
    for (int i = iCheck - 1; i >= 0; --i)
    {
        string nameCheck = ttCheck.GetChild(i).GetInfo("Name")[0].value;
        if (!nameCheck.StartsWith("_", StringComparison.Ordinal))
            ++iRecountCheck;
    }

    int iRecountDisc = 0;
    for (int i = iDisc - 1; i >= 0; --i)
    {
        string nameDisc = ttDiscover.GetChild(i).GetInfo("Name")[0].value;
        if (!nameDisc.StartsWith("_", StringComparison.Ordinal))
            ++iRecountDisc;
    }

    if (iRecountCheck != iRecountDisc)
        throw new Exception("Failed to discover custom proprties correctly");
}
// Verifies that discoveredText lists at least one text type with the same content
// type as ttOrig. When it does not, the original text type is dumped, its dump and
// content type are recorded on m_workItem, and an Exception is thrown.
private void CheckTypeWasFound(string fileName, DocumentText discoveredText, IAbstractTextType ttOrig)
{
    List<IAbstractTextType> matches = discoveredText.GetTextTypes(ttOrig.GetContentType());
    if (matches.Count != 0)
    {
        return;
    }

    DumpFoundText(fileName, ttOrig);
    m_workItem.Info = DumpTextType(ttOrig);
    m_workItem.ContentType = ttOrig.GetContentType();
    throw new Exception("Failed to discover metadata found in base document");
}
// Walks every text type of the reference discovery (dtCheck) and verifies via
// CheckTypeWasFound that discoveredText contains the same type. Text types that are
// all white space, and the BuiltInProperty and RoutingSlip types, are not checked.
private void CompareDiscoveryForXls(string filename, DocumentText discoveredText, DocumentText dtCheck)
{
    foreach (IAbstractTextType expected in dtCheck.GetTextTypes())
    {
        if (IsAllWhiteSpace(expected))
        {
            continue;
        }

        ContentType expectedType = expected.GetContentType();
        bool isIgnoredType = expectedType == ContentType.BuiltInProperty
                             || expectedType == ContentType.RoutingSlip;
        if (!isIgnoredType)
        {
            CheckTypeWasFound(filename, discoveredText, expected);
        }
    }
}