/// <summary>
/// Reads the "Pictures" stream of a PowerPoint OLE document, runs the EXIF analysis on
/// every picture record found and merges the results into <c>foundMetadata</c>.
/// </summary>
/// <param name="doc">OLE document from which the "Pictures" stream is opened.</param>
private void GetImagesPpt(OleDocument doc)
{
    // Number of bytes at the start of each record payload that are skipped before the
    // image data is handed to the EXIF parser.
    const int imageDataOffset = 0x11;

    using (Stream stmPictures = doc.OpenStream("Pictures"))
    {
        if (stmPictures == null)
        {
            return;
        }

        int ImagesFound = 0;
        stmPictures.Seek(0, SeekOrigin.Begin);

        // One reader for the whole stream. It is intentionally not disposed here because
        // disposing it would close stmPictures, which is owned by the using statement above.
        BinaryReader brData = new BinaryReader(stmPictures);

        while (stmPictures.Position < stmPictures.Length - 0x19)
        {
            // Skip the first 4 bytes of the record; the following 4 bytes hold the payload length.
            stmPictures.Seek(0x4, SeekOrigin.Current);
            UInt32 PICLength = brData.ReadUInt32();

            // Stop on an empty record, on a length that does not fit in an int (ReadBytes
            // takes an int) or on a length that runs past the end of the stream.
            if (PICLength == 0 || PICLength > int.MaxValue || stmPictures.Position + PICLength > stmPictures.Length)
            {
                break;
            }

            byte[] bufferPIC = brData.ReadBytes((int)PICLength);

            // A payload shorter than the skipped prefix cannot contain image data; building
            // the MemoryStream below would throw on a negative count, so move to the next record.
            if (bufferPIC.Length < imageDataOffset)
            {
                continue;
            }

            string strImageName = "Image" + ImagesFound++;
            using (MemoryStream msJPG = new MemoryStream(bufferPIC, imageDataOffset, bufferPIC.Length - imageDataOffset))
            {
                FileMetadata exifMetadata = null;
                using (EXIFDocument eDoc = new EXIFDocument(msJPG, ".jpg"))
                {
                    exifMetadata = eDoc.AnalyzeFile();
                }

                if (exifMetadata != null)
                {
                    foundMetadata.EmbeddedImages.Add(strImageName, exifMetadata);
                    // Users and applications found in the image are also attributed to the document.
                    this.foundMetadata.AddRange(exifMetadata.Users.ToArray());
                    this.foundMetadata.AddRange(exifMetadata.Applications.ToArray());
                }
            }
        }
    }
}
/// <summary>
/// Factory that picks the <see cref="DocumentExtractor"/> implementation matching a file extension.
/// </summary>
/// <param name="extension">Extension of the document; it is normalized with <c>NormalizeExtension</c> before being matched.</param>
/// <param name="file">Stream with the document contents.</param>
/// <returns>A new extractor bound to <paramref name="file"/>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="file"/> is null.</exception>
/// <exception cref="ArgumentException">The extension is not supported.</exception>
public static DocumentExtractor Create(string extension, Stream file)
{
    if (file == null)
    {
        throw new ArgumentNullException(nameof(file));
    }

    string normalizedExtension = NormalizeExtension(extension);
    if (!IsSupportedExtension(normalizedExtension))
    {
        throw new ArgumentException("Extension not supported", nameof(extension));
    }

    // NOTE(review): the constructors below receive the caller's original extension, not the
    // normalized one - confirm that is what they expect.
    switch (normalizedExtension)
    {
        case ".sxw":
        case ".odt":
        case ".ods":
        case ".odg":
        case ".odp":
            return new OpenOfficeDocument(file, extension);

        case ".docx":
        case ".xlsx":
        case ".pptx":
        case ".ppsx":
            return new OfficeOpenXMLDocument(file, extension);

        case ".doc":
        case ".xls":
        case ".ppt":
        case ".pps":
            return new Office972003(file);

        case ".pdf":
            return new PDFDocument(file);

        case ".wpd":
            return new WPDDocument(file);

        case ".raw":
        case ".cr2":
        case ".crw":
        case ".jpg":
        case ".jpeg":
            return new EXIFDocument(file, extension);

        case ".svg":
        case ".svgz":
            return new SVGDocument(file);

        case ".indd":
            return new InDDDocument(file);

        case ".rdp":
            return new RDPDocument(file);

        case ".ica":
            return new ICADocument(file);

        default:
            // Reached only when IsSupportedExtension accepts an extension this switch does not map.
            throw new ArgumentException("Extension not supported", nameof(extension));
    }
}
/// <summary>
/// Extracts the metadata of the PDF document: EXIF data of embedded images, XMP metadata,
/// the document information entries (title, subject, author, keywords, creator, producer,
/// dates) and the paths, links and e-mails found in the raw stream.
/// </summary>
/// <returns>
/// The metadata collected so far. Never null: an empty <c>FileMetadata</c> is returned
/// when nothing could be read.
/// </returns>
public override FileMetadata AnalyzeFile()
{
    try
    {
        this.foundMetadata = new FileMetadata();
        using (PdfDocument doc = PdfReader.Open(this.fileStream, PdfDocumentOpenMode.InformationOnly))
        {
            // Registers the software named in a document information entry. When no known
            // application is recognised the plain text of the entry is stored instead.
            void AddApplicationFrom(string rawValue)
            {
                if (String.IsNullOrWhiteSpace(rawValue))
                {
                    return;
                }

                string plainValue = Functions.ToPlainText(rawValue);
                string strSoftware = ApplicationAnalysis.GetApplicationsFromString(plainValue);
                if (!String.IsNullOrWhiteSpace(strSoftware))
                {
                    this.foundMetadata.Add(new Application(strSoftware));
                }
                else if (!String.IsNullOrWhiteSpace(plainValue))
                {
                    // No known application was identified; still report the one that was found.
                    this.foundMetadata.Add(new Application(plainValue.Trim()));
                }
            }

            int imageNumber = 0;

            // Read embedded images
            foreach (PdfDictionary item in doc.Internals.GetAllObjects().Where(p => p is PdfDictionary d && d.Stream != null && "/Image".Equals(d.Elements["/Subtype"]?.ToString())))
            {
                try
                {
                    using (MemoryStream msJPG = new MemoryStream(item.Stream.Value))
                    using (EXIFDocument eDoc = new EXIFDocument(msJPG))
                    {
                        FileMetadata exifMetadata = eDoc.AnalyzeFile();

                        // Ignore images which only contain 'Adobe JPEG' makernotes
                        if (exifMetadata != null && exifMetadata.HasMetadata() && !exifMetadata.Makernotes.All(p => p.Key == "Adobe JPEG"))
                        {
                            foundMetadata.EmbeddedImages.Add(imageNumber.ToString(), exifMetadata);
                            imageNumber++;
                            this.foundMetadata.AddRange(exifMetadata.Users.ToArray());
                            this.foundMetadata.AddRange(exifMetadata.Applications.ToArray());
                        }
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: a broken image must not stop the analysis of the document,
                    // but leave a trace instead of discarding the error silently.
                    System.Diagnostics.Debug.WriteLine(ex.ToString());
                }
            }

            ReadXMPMetadata(doc);

            string title = doc.Info.Title;
            if (!String.IsNullOrEmpty(title))
            {
                this.foundMetadata.Title = Functions.ToPlainText(title);
                // A title that is an absolute URI is also recorded as a path.
                if (Uri.IsWellFormedUriString(title, UriKind.Absolute))
                {
                    this.foundMetadata.Add(new Diagrams.Path(PathAnalysis.CleanPath(title), true));
                }
            }

            if (!String.IsNullOrEmpty(doc.Info.Subject))
            {
                this.foundMetadata.Subject = Functions.ToPlainText(doc.Info.Subject);
            }

            if (!String.IsNullOrEmpty(doc.Info.Author))
            {
                this.foundMetadata.Add(new User(Functions.ToPlainText(doc.Info.Author), true));
            }

            if (!String.IsNullOrEmpty(doc.Info.Keywords))
            {
                this.foundMetadata.Keywords = Functions.ToPlainText(doc.Info.Keywords);
            }

            AddApplicationFrom(doc.Info.Creator);
            AddApplicationFrom(doc.Info.Producer);

            // Reading a date can throw InvalidCastException; in that case the date is left unset.
            try
            {
                if (doc.Info.CreationDate != DateTime.MinValue)
                {
                    this.foundMetadata.Dates.CreationDate = doc.Info.CreationDate;
                }
            }
            catch (InvalidCastException)
            {
            }

            try
            {
                if (doc.Info.ModificationDate != DateTime.MinValue)
                {
                    this.foundMetadata.Dates.ModificationDate = doc.Info.ModificationDate;
                }
            }
            catch (InvalidCastException)
            {
            }
        }

        SearchPathsLinksAndEmails(this.fileStream);

        // Find users in paths
        foreach (Diagrams.Path path in this.foundMetadata.Paths)
        {
            string strUser = PathAnalysis.ExtractUserFromPath(path.Value);
            // Only register a user when the path actually yielded a name.
            if (!String.IsNullOrEmpty(strUser))
            {
                this.foundMetadata.Add(new User(strUser, path.IsComputerFolder));
            }
        }

        // Also search software in the title (only pdf). It is added only if the software is known.
        if (!String.IsNullOrEmpty(foundMetadata.Title))
        {
            string strSoftware = ApplicationAnalysis.GetApplicationsFromString(foundMetadata.Title);
            if (!String.IsNullOrWhiteSpace(strSoftware))
            {
                this.foundMetadata.Add(new Application(strSoftware));
            }
        }
    }
    catch (PdfReaderException)
    {
        // The stream could not be read as a PDF; return whatever was collected.
    }
    catch (Exception ex)
    {
        System.Diagnostics.Debug.WriteLine(ex.ToString());
    }
    finally
    {
        if (foundMetadata == null)
        {
            this.foundMetadata = new FileMetadata();
        }

        if (fileStream != null)
        {
            this.fileStream.Dispose();
        }
    }

    return this.foundMetadata;
}
/// <summary>
/// Extracts the metadata of the OpenOffice document: the well-known XML entries of the
/// package (meta.xml, settings.xml, content.xml, VersionList.xml), the EXIF data of the
/// JPEG images stored under "Pictures/" and the users that can be derived from the paths found.
/// </summary>
/// <returns>The metadata collected; on error, whatever had been gathered before the failure.</returns>
public override FileMetadata AnalyzeFile()
{
    try
    {
        this.foundMetadata = new FileMetadata();
        using (ZipFile zip = ZipFile.Read(this.fileStream))
        {
            string strFile = "meta.xml";
            if (zip.EntryFileNames.Contains(strFile))
            {
                using (Stream stmXML = new MemoryStream())
                {
                    zip.Extract(strFile, stmXML);
                    stmXML.Seek(0, SeekOrigin.Begin);
                    AnalizeFileMeta(stmXML);
                }
            }

            strFile = "settings.xml";
            if (zip.EntryFileNames.Contains(strFile))
            {
                using (Stream stmXML = new MemoryStream())
                {
                    zip.Extract(strFile, stmXML);
                    stmXML.Seek(0, SeekOrigin.Begin);
                    analizeFileSettings(stmXML);
                }
            }

            strFile = "content.xml";
            if (zip.EntryFileNames.Contains(strFile))
            {
                using (Stream stmXML = new MemoryStream())
                {
                    zip.Extract(strFile, stmXML);
                    stmXML.Seek(0, SeekOrigin.Begin);
                    AnalizeFileContent(stmXML);
                }
            }

            strFile = "VersionList.xml";
            if (zip.EntryFileNames.Contains(strFile))
            {
                using (Stream stmXML = new MemoryStream())
                {
                    zip.Extract(strFile, stmXML);
                    stmXML.Seek(0, SeekOrigin.Begin);
                    AnalizeFileVersionList(stmXML, zip);
                }
            }

            // Extract the EXIF information of the images embedded in the document
            foreach (string strFileName in zip.EntryFileNames)
            {
                // Invariant lower-casing: entry names are identifiers, so the match must not
                // depend on the current culture.
                string strFileNameLo = strFileName.ToLowerInvariant();

                // Keep only the *.jpg / *.jpeg images inside the "Pictures/" folder
                bool isPicture = strFileNameLo.StartsWith("pictures/", StringComparison.Ordinal)
                    && (strFileNameLo.EndsWith(".jpg", StringComparison.Ordinal) || strFileNameLo.EndsWith(".jpeg", StringComparison.Ordinal));
                if (!isPicture)
                {
                    continue;
                }

                using (Stream stmXML = new MemoryStream())
                {
                    zip.Extract(strFileName, stmXML);
                    stmXML.Seek(0, SeekOrigin.Begin);
                    using (EXIFDocument eDoc = new EXIFDocument(stmXML, System.IO.Path.GetExtension(strFileNameLo)))
                    {
                        FileMetadata exifMetadata = eDoc.AnalyzeFile();
                        if (exifMetadata == null)
                        {
                            // Nothing could be read from this image; keep going with the rest.
                            continue;
                        }

                        // Store the image together with its EXIF information. Two entries in
                        // different sub-folders can share a file name, so fall back to the full
                        // entry name instead of letting Add throw on the duplicate key.
                        string imageKey = System.IO.Path.GetFileName(strFileName);
                        if (this.foundMetadata.EmbeddedImages.ContainsKey(imageKey))
                        {
                            imageKey = strFileName;
                        }

                        if (!this.foundMetadata.EmbeddedImages.ContainsKey(imageKey))
                        {
                            this.foundMetadata.EmbeddedImages.Add(imageKey, exifMetadata);
                        }

                        // Users and applications from the EXIF data are added to those of the document
                        this.foundMetadata.AddRange(exifMetadata.Users.ToArray());
                        this.foundMetadata.AddRange(exifMetadata.Applications.ToArray());
                    }
                }
            }
        }

        // Look for users in the paths found in the document
        foreach (Diagrams.Path ri in this.foundMetadata.Paths)
        {
            string strUser = PathAnalysis.ExtractUserFromPath(ri.Value);
            if (!string.IsNullOrEmpty(strUser))
            {
                this.foundMetadata.Add(new User(strUser, ri.IsComputerFolder, "Path: " + ri.Value));
            }
        }
    }
    catch (Exception e)
    {
        System.Diagnostics.Debug.WriteLine(String.Format("Error analyzing OpenOffice document ({0})", e.ToString()));
    }

    return this.foundMetadata;
}
/// <summary>
/// Extracts the metadata of an Office Open XML package: core and application properties,
/// format-specific parts (revision data, language and links for .docx; printers and links
/// for .xlsx; slide links for presentations) and the EXIF data of embedded JPEG images.
/// </summary>
/// <returns>The metadata collected; on error, whatever had been gathered before the failure.</returns>
public override FileMetadata AnalyzeFile()
{
    try
    {
        this.foundMetadata = new FileMetadata();
        using (Package pZip = Package.Open(this.fileStream))
        {
            Uri uriFile = new Uri("/docProps/core.xml", UriKind.Relative);
            if (pZip.PartExists(uriFile))
            {
                PackagePart pDocument = pZip.GetPart(uriFile);
                using (Stream stmDoc = pDocument.GetStream(FileMode.Open, FileAccess.Read))
                {
                    AnalizeFileCore(stmDoc);
                }
            }

            uriFile = new Uri("/docProps/app.xml", UriKind.Relative);
            if (pZip.PartExists(uriFile))
            {
                PackagePart pDocument = pZip.GetPart(uriFile);
                using (Stream stmDoc = pDocument.GetStream(FileMode.Open, FileAccess.Read))
                {
                    AnalizeFileApp(stmDoc);
                }
            }

            // Version control
            if (strExtlo == ".docx")
            {
                uriFile = new Uri("/word/document.xml", UriKind.Relative);
                if (pZip.PartExists(uriFile))
                {
                    PackagePart pDocument = pZip.GetPart(uriFile);
                    using (Stream stmDoc = pDocument.GetStream(FileMode.Open, FileAccess.Read))
                    {
                        AnalizeFileDocument(stmDoc);
                    }
                }

                // Read the settings part to recover the language of the document
                if (foundMetadata.Language == string.Empty)
                {
                    uriFile = new Uri("/word/settings.xml", UriKind.Relative);
                    if (pZip.PartExists(uriFile))
                    {
                        PackagePart pDocument = pZip.GetPart(uriFile);
                        using (Stream stmDoc = pDocument.GetStream(FileMode.Open, FileAccess.Read))
                        {
                            AnalizeFileSettings(stmDoc);
                        }
                    }
                }

                // Read document.xml.rels to obtain the links of the document
                uriFile = new Uri("/word/_rels/document.xml.rels", UriKind.Relative);
                if (pZip.PartExists(uriFile))
                {
                    PackagePart pDocument = pZip.GetPart(uriFile);
                    using (Stream stmDoc = pDocument.GetStream(FileMode.Open, FileAccess.Read))
                    {
                        AnalizeLinks(stmDoc);
                    }
                }
            }
            // Get the printer names and the links of xlsx documents
            else if (strExtlo == ".xlsx")
            {
                foreach (PackagePart pp in pZip.GetParts())
                {
                    string partUri = pp.Uri.ToString();
                    if (partUri.StartsWith("/xl/printerSettings/printerSettings", StringComparison.Ordinal))
                    {
                        // The first 32 UTF-16 characters of the part are read as the printer
                        // name; unused positions stay as '\0' and are stripped below.
                        char[] name = new char[32];
                        using (StreamReader sr = new StreamReader(pp.GetStream(FileMode.Open, FileAccess.Read), Encoding.Unicode))
                        {
                            sr.Read(name, 0, 32);
                        }
                        this.foundMetadata.Add(new Printer(Functions.FilterPrinter(new string(name).Replace("\0", ""))));
                    }

                    if (partUri.StartsWith("/xl/worksheets/_rels/", StringComparison.Ordinal))
                    {
                        using (Stream stmDoc = pp.GetStream(FileMode.Open, FileAccess.Read))
                        {
                            AnalizeLinks(stmDoc);
                        }
                    }
                }
            }
            // Presentations: .ppsx is included on the assumption that slide shows keep their
            // slide relationships under the same "/ppt/slides/_rels/" folder as .pptx.
            else if (strExtlo == ".pptx" || strExtlo == ".ppsx")
            {
                foreach (PackagePart pp in pZip.GetParts())
                {
                    if (pp.Uri.ToString().StartsWith("/ppt/slides/_rels/", StringComparison.Ordinal))
                    {
                        using (Stream stmDoc = pp.GetStream(FileMode.Open, FileAccess.Read))
                        {
                            AnalizeLinks(stmDoc);
                        }
                    }
                }
            }

            // Extract the EXIF information of every image
            foreach (PackagePart pp in pZip.GetParts())
            {
                string strFileName = pp.Uri.ToString();
                // Invariant lower-casing: part names are identifiers, not user-facing text.
                string strFileNameLo = strFileName.ToLowerInvariant();

                // Keep only the *.jpg / *.jpeg images of the three possible media folders
                bool inMediaFolder = strFileNameLo.StartsWith("/word/media/", StringComparison.Ordinal)
                    || strFileNameLo.StartsWith("/ppt/media/", StringComparison.Ordinal)
                    || strFileNameLo.StartsWith("/xl/media/", StringComparison.Ordinal);
                bool isJpeg = strFileNameLo.EndsWith(".jpg", StringComparison.Ordinal)
                    || strFileNameLo.EndsWith(".jpeg", StringComparison.Ordinal);
                if (!inMediaFolder || !isJpeg)
                {
                    continue;
                }

                using (Stream stmImage = pp.GetStream(FileMode.Open, FileAccess.Read))
                using (EXIFDocument eDoc = new EXIFDocument(stmImage, System.IO.Path.GetExtension(strFileNameLo)))
                {
                    FileMetadata exifMetadata = eDoc.AnalyzeFile();
                    if (exifMetadata == null)
                    {
                        // Nothing could be read from this image; keep going with the rest.
                        continue;
                    }

                    // The same file name can appear in more than one media folder; fall back
                    // to the full part name instead of letting Add throw on the duplicate key.
                    string imageKey = System.IO.Path.GetFileName(strFileName);
                    if (foundMetadata.EmbeddedImages.ContainsKey(imageKey))
                    {
                        imageKey = strFileName;
                    }

                    if (!foundMetadata.EmbeddedImages.ContainsKey(imageKey))
                    {
                        foundMetadata.EmbeddedImages.Add(imageKey, exifMetadata);
                    }

                    // Copy the user and application metadata of the image to the document
                    this.foundMetadata.AddRange(exifMetadata.Users.ToArray());
                    this.foundMetadata.AddRange(exifMetadata.Applications.ToArray());
                }
            }
        }
    }
    catch (Exception e)
    {
        System.Diagnostics.Debug.WriteLine(e.ToString());
    }

    return this.foundMetadata;
}
/// <summary>
/// Looks for pictures referenced from the "WordDocument" stream of a Word OLE document,
/// reads each picture block from the "Data" stream, records any file-system path found in
/// it and merges the EXIF metadata of the embedded JPEG (if any) into <c>foundMetadata</c>.
/// </summary>
/// <param name="doc">OLE document from which the "WordDocument" and "Data" streams are opened.</param>
private void GetImagesDoc(OleDocument doc)
{
    using (Stream WordDocument = doc.OpenStream("WordDocument"))
    using (Stream stmData = doc.OpenStream("Data"))
    {
        if (WordDocument == null || stmData == null)
        {
            return;
        }

        // Two Int32 values are read starting at offset 0x18, so the stream must hold at
        // least 0x20 bytes; anything shorter cannot contain them.
        if (WordDocument.Length < 0x20)
        {
            return;
        }

        WordDocument.Seek(0x18, SeekOrigin.Begin);
        BinaryReader br = new BinaryReader(WordDocument);
        br.ReadInt32(); // first value (fcMin) is not needed, only consumed
        Int32 fcMac = br.ReadInt32();

        // Round fcMac up to the next multiple of 0x200: the scan proceeds in 0x200-byte pages.
        Int32 FKPStart = fcMac % 0x200 == 0 ? fcMac : (fcMac - fcMac % 0x200) + 0x200;
        if (FKPStart < 0)
        {
            // Corrupt header (negative or overflowing offset); Seek would throw.
            return;
        }
        WordDocument.Seek(FKPStart, SeekOrigin.Begin);

        // One reader for the Data stream; not disposed because that would close stmData.
        BinaryReader brData = new BinaryReader(stmData);
        int imagesFound = 0;

        while (WordDocument.Position + 0x200 < WordDocument.Length)
        {
            byte[] FKP = br.ReadBytes(0x200);
            // A page whose last byte is zero ends the scan.
            if (FKP[0x1FF] == 00)
            {
                break;
            }

            foreach (int offset in Functions.SearchBytesInBytes(FKP, new byte[] { 0x03, 0x6A }))
            {
                // The 4 bytes after the 2-byte marker must still be inside the page.
                if (offset >= 0x200 - 5)
                {
                    continue;
                }

                // Little-endian 32-bit offset into the Data stream.
                int PICOffset = FKP[offset + 5] * 0x1000000 + FKP[offset + 4] * 0x10000 + FKP[offset + 3] * 0x100 + FKP[offset + 2];

                // The offset must leave room for the 4-byte length field read below.
                if (PICOffset < 0 || PICOffset > stmData.Length - 4)
                {
                    continue;
                }

                stmData.Seek(PICOffset, SeekOrigin.Begin);
                UInt32 PICLength = brData.ReadUInt32();

                // Read PICLength minus the 4 bytes already consumed, or whatever is left
                // in the stream when the declared length does not fit.
                long remaining = stmData.Length - stmData.Position;
                int bufferLen = PICLength < remaining ? (int)PICLength - 4 : (int)remaining;
                if (bufferLen <= 0)
                {
                    continue;
                }

                byte[] bufferPIC = brData.ReadBytes(bufferLen);
                string strImageName = String.Empty;

                // Decode the block as UTF-16 text and collect anything that looks like a
                // path; the last path found provides the image name.
                using (StreamReader sr = new StreamReader(new MemoryStream(bufferPIC), Encoding.Unicode))
                {
                    String sRead = sr.ReadToEnd();
                    foreach (Match m in Regex.Matches(sRead, @"([a-z]:|\\)\\[a-zá-ú0-9\\\s,;.\-_#\$%&()=ñ´'¨{}Ç`/n/r\[\]+^@]+\\[a-zá-ú0-9\\\s,;.\-_#\$%&()=ñ´'¨{}Ç`/n/r\[\]+^@]+", RegexOptions.IgnoreCase))
                    {
                        String path = m.Value.Trim();
                        this.foundMetadata.Add(new Diagrams.Path(PathAnalysis.CleanPath(path), true));
                        strImageName = System.IO.Path.GetFileName(path);
                    }
                }

                // No usable name, or name already taken: generate a sequential one.
                if (String.IsNullOrEmpty(strImageName) || foundMetadata.EmbeddedImages.ContainsKey(strImageName))
                {
                    strImageName = "Image" + imagesFound++;
                }

                // Locate the JPEG start marker inside the block.
                List<int> lstJPEG = Functions.SearchBytesInBytes(bufferPIC, new byte[] { 0xFF, 0xD8, 0xFF });
                if (lstJPEG.Count == 0)
                {
                    continue;
                }

                using (MemoryStream msJPG = new MemoryStream(bufferPIC, lstJPEG[0], bufferPIC.Length - lstJPEG[0]))
                using (EXIFDocument eDoc = new EXIFDocument(msJPG, ".jpg"))
                {
                    FileMetadata exifMetadata = eDoc.AnalyzeFile();
                    if (exifMetadata != null)
                    {
                        foundMetadata.EmbeddedImages.Add(strImageName, exifMetadata);
                        this.foundMetadata.AddRange(exifMetadata.Users.ToArray());
                        this.foundMetadata.AddRange(exifMetadata.Applications.ToArray());
                    }
                }
            }
        }
    }
}