/// <summary> /// Extracts an Gzip file contained in fileEntry. Since this function is recursive, even though /// Gzip only supports a single compressed file, that inner file could itself contain multiple others. /// </summary> /// <param name="fileEntry"> FileEntry to extract </param> /// <returns> Extracted files </returns> public async IAsyncEnumerable <FileEntry> ExtractAsync(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor) { GZipArchive?gzipArchive = null; try { gzipArchive = GZipArchive.Open(fileEntry.Content); } catch (Exception e) { Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.GZIP, fileEntry.FullPath, string.Empty, e.GetType()); } if (gzipArchive != null) { foreach (var entry in gzipArchive.Entries) { if (entry.IsDirectory) { continue; } governor.CheckResourceGovernor(entry.Size); var newFilename = Path.GetFileNameWithoutExtension(fileEntry.Name); if (fileEntry.Name.EndsWith(".tgz", StringComparison.InvariantCultureIgnoreCase)) { newFilename = newFilename[0..^ 4] + ".tar";
/// <summary>
/// Extracts a 7-Zip file contained in fileEntry, recursing into each extracted file.
/// </summary>
/// <param name="fileEntry"> FileEntry to extract </param>
/// <param name="options"> Extractor options (ExtractSelfOnFail controls the failure path) </param>
/// <param name="governor"> Resource governor charged with each entry's uncompressed size </param>
/// <returns> Extracted files </returns>
public async IAsyncEnumerable<FileEntry> ExtractAsync(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor)
{
    var sevenZipArchive = GetSevenZipArchive(fileEntry, options);
    if (sevenZipArchive != null)
    {
        // Only complete, non-directory entries are extractable.
        var entries = sevenZipArchive.Entries.Where(x => !x.IsDirectory && x.IsComplete).ToList();
        foreach (var entry in entries)
        {
            // Charge the governor before opening the entry stream.
            governor.CheckResourceGovernor(entry.Size);
            var name = entry.Key.Replace('/', Path.DirectorySeparatorChar);
            var newFileEntry = await FileEntry.FromStreamAsync(name, entry.OpenEntryStream(), fileEntry);
            // Abort on archive quines (self-referencing archives) to prevent infinite recursion.
            if (Extractor.IsQuine(newFileEntry))
            {
                Logger.Info(Extractor.IS_QUINE_STRING, fileEntry.Name, fileEntry.FullPath);
                throw new OverflowException();
            }
            await foreach (var extractedFile in Context.ExtractAsync(newFileEntry, options, governor))
            {
                yield return(extractedFile);
            }
        }
    }
    else
    {
        // Could not open the archive: optionally surface the original entry as-is.
        if (options.ExtractSelfOnFail)
        {
            yield return(fileEntry);
        }
    }
}
/// <summary>
/// Extracts a VHD disk image: detects its logical volumes and dumps each one recursively.
/// </summary>
/// <param name="fileEntry"> The VHD image to extract </param>
/// <param name="options"> Extractor options (ExtractSelfOnFail controls the failure path) </param>
/// <param name="governor"> Resource governor passed through to the volume dump </param>
/// <returns> The extracted FileEntries </returns>
public IEnumerable<FileEntry> Extract(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor)
{
    // Ownership.None: the disk does not take ownership of (or dispose) fileEntry.Content.
    using var disk = new DiscUtils.Vhd.Disk(fileEntry.Content, Ownership.None);
    LogicalVolumeInfo[]? logicalVolumes = null;
    try
    {
        var manager = new VolumeManager(disk);
        logicalVolumes = manager.GetLogicalVolumes();
    }
    catch (Exception e)
    {
        Logger.Debug("Error reading {0} disk at {1} ({2}:{3})", disk.GetType(), fileEntry.FullPath, e.GetType(), e.Message);
    }
    if (logicalVolumes != null)
    {
        foreach (var volume in logicalVolumes)
        {
            // DumpLogicalVolume handles per-file governor checks and recursion.
            foreach (var entry in DiscCommon.DumpLogicalVolume(volume, fileEntry.FullPath, options, governor, Context, fileEntry))
            {
                yield return(entry);
            }
        }
    }
    else
    {
        // Volume detection failed: optionally surface the original entry as-is.
        if (options.ExtractSelfOnFail)
        {
            yield return(fileEntry);
        }
    }
}
/// <summary>
/// Extracts a zip file contained in fileEntry, recursing into each extracted entry.
/// </summary>
/// <param name="fileEntry"> FileEntry to extract </param>
/// <param name="options"> Extractor options (passwords, ExtractSelfOnFail) </param>
/// <param name="governor"> Resource governor charged with each entry's size </param>
/// <returns> Extracted files </returns>
public async IAsyncEnumerable<FileEntry> ExtractAsync(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor)
{
    ZipFile? zipFile = null;
    try
    {
        zipFile = new ZipFile(fileEntry.Content);
    }
    catch (Exception e)
    {
        Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.ZIP, fileEntry.FullPath, string.Empty, e.GetType());
    }
    if (zipFile != null)
    {
        var buffer = new byte[BUFFER_SIZE];
        var passwordFound = false;
        foreach (ZipEntry? zipEntry in zipFile)
        {
            if (zipEntry is null || zipEntry.IsDirectory || !zipEntry.CanDecompress)
            {
                continue;
            }
            // Resolve the archive password once, on the first encrypted entry.
            if (zipEntry.IsCrypted && !passwordFound)
            {
                zipFile.Password = GetZipPassword(fileEntry, zipFile, zipEntry, options) ?? string.Empty;
                passwordFound = true;
            }
            governor.CheckResourceGovernor(zipEntry.Size);
            // Spool to a self-deleting temp file so large entries don't sit in memory.
            using var fs = new FileStream(Path.GetTempFileName(), FileMode.Create, FileAccess.ReadWrite, FileShare.ReadWrite, 4096, FileOptions.DeleteOnClose);
            try
            {
                var zipStream = zipFile.GetInputStream(zipEntry);
                StreamUtils.Copy(zipStream, fs, buffer);
            }
            catch (Exception e)
            {
                Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.ZIP, fileEntry.FullPath, zipEntry.Name, e.GetType());
            }
            var name = zipEntry.Name.Replace('/', Path.DirectorySeparatorChar);
            var newFileEntry = new FileEntry(name, fs, fileEntry);
            // Abort on archive quines (self-referencing archives) to prevent infinite recursion.
            if (Extractor.IsQuine(newFileEntry))
            {
                Logger.Info(Extractor.IS_QUINE_STRING, fileEntry.Name, fileEntry.FullPath);
                throw new OverflowException();
            }
            await foreach (var extractedFile in Context.ExtractAsync(newFileEntry, options, governor))
            {
                yield return extractedFile;
            }
        }
    }
    else if (options.ExtractSelfOnFail)
    {
        // CONSISTENCY FIX: every sibling extractor yields the original entry when the
        // archive cannot be opened and ExtractSelfOnFail is set; the original zip
        // implementation silently dropped the file here.
        yield return fileEntry;
    }
}
/// <summary>
/// Extracts an archive file created with GNU ar
/// </summary>
/// <param name="fileEntry"> The ar archive to extract </param>
/// <param name="options"> Extractor options (Parallel/BatchSize control batching) </param>
/// <param name="governor"> Resource governor (passed through to ArFile and recursion) </param>
/// <returns> The extracted FileEntries </returns>
public IEnumerable<FileEntry> Extract(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor)
{
    IEnumerable<FileEntry>? fileEntries = null;
    try
    {
        fileEntries = ArFile.GetFileEntries(fileEntry, options, governor);
    }
    catch (Exception e)
    {
        Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.AR, fileEntry.FullPath, string.Empty, e.GetType());
        // OverflowException is the governor's abort signal; it must propagate.
        if (e is OverflowException)
        {
            throw;
        }
    }
    if (fileEntries != null)
    {
        if (options.Parallel)
        {
            // NOTE(review): fileEntries is a deferred enumerable that is re-enumerated by
            // Any(), Take(), Skip() and Count() each pass — confirm ArFile.GetFileEntries
            // yields a re-enumerable (not single-shot) sequence.
            while (fileEntries.Any())
            {
                var tempStore = new ConcurrentStack<FileEntry>();
                var selectedEntries = fileEntries.Take(options.BatchSize);
                // Extract each batch member in parallel, collecting results on a concurrent stack.
                selectedEntries.AsParallel().ForAll(arEntry =>
                {
                    tempStore.PushRange(Context.ExtractFile(arEntry, options, governor).ToArray());
                });
                fileEntries = fileEntries.Skip(selectedEntries.Count());
                while (tempStore.TryPop(out var result))
                {
                    if (result != null)
                    {
                        yield return(result);
                    }
                }
            }
        }
        else
        {
            foreach (var entry in fileEntries)
            {
                foreach (var extractedFile in Context.ExtractFile(entry, options, governor))
                {
                    yield return(extractedFile);
                }
            }
        }
    }
    else
    {
        // Could not parse the archive: optionally surface the original entry as-is.
        if (options.ExtractSelfOnFail)
        {
            yield return(fileEntry);
        }
    }
}
/// <summary>
/// Automatically check compatibility with multiple (currently one, but there will be multiple) extractors;
/// Return null if there is not a suitable extractor.
/// Otherwise return an Extractor constructed with the passed option.
/// </summary>
public static Extractor findExtractor(ExtractorOptions option = null)
    => Tar.Check_compatibility(option) ? new Tar(option) : null;
/// <summary>
/// Asynchronously extracts a Tar archive, recursing into each extracted entry.
/// </summary>
/// <param name="fileEntry"> The tar archive to extract </param>
/// <param name="options"> Extractor options (ExtractSelfOnFail controls the failure path) </param>
/// <param name="governor"> Resource governor </param>
/// <returns> The extracted FileEntries </returns>
public async IAsyncEnumerable<FileEntry> ExtractAsync(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor)
{
    TarEntry tarEntry;
    TarInputStream? tarStream = null;
    try
    {
        tarStream = new TarInputStream(fileEntry.Content);
    }
    catch (Exception e)
    {
        Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.TAR, fileEntry.FullPath, string.Empty, e.GetType());
    }
    if (tarStream != null)
    {
        while ((tarEntry = tarStream.GetNextEntry()) != null)
        {
            if (tarEntry.IsDirectory)
            {
                continue;
            }
            // Spool each entry to a self-deleting temp file; FileEntry is constructed with
            // passthroughStream=true so it takes ownership of fs (no using here).
            var fs = new FileStream(Path.GetTempFileName(), FileMode.Create, FileAccess.ReadWrite, FileShare.ReadWrite, 4096, FileOptions.DeleteOnClose);
            // NOTE(review): this charges the governor with the full stream length for every
            // entry, not the entry's own size — confirm that is intended.
            governor.CheckResourceGovernor(tarStream.Length);
            try
            {
                tarStream.CopyEntryContents(fs);
            }
            catch (Exception e)
            {
                Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.TAR, fileEntry.FullPath, tarEntry.Name, e.GetType());
            }
            var name = tarEntry.Name.Replace('/', Path.DirectorySeparatorChar);
            var newFileEntry = new FileEntry(name, fs, fileEntry, true);
            // Abort on archive quines (self-referencing archives) to prevent infinite recursion.
            if (Extractor.IsQuine(newFileEntry))
            {
                Logger.Info(Extractor.IS_QUINE_STRING, fileEntry.Name, fileEntry.FullPath);
                throw new OverflowException();
            }
            await foreach (var extractedFile in Context.ExtractAsync(newFileEntry, options, governor))
            {
                yield return(extractedFile);
            }
        }
        tarStream.Dispose();
    }
    else
    {
        // Could not open the tar stream: optionally surface the original entry as-is.
        if (options.ExtractSelfOnFail)
        {
            yield return(fileEntry);
        }
    }
}
/// <summary>
/// Opens fileEntry as a RAR archive, probing for encryption and trying any
/// configured passwords whose filename pattern matches.
/// </summary>
/// <param name="fileEntry"> The FileEntry holding the RAR content </param>
/// <param name="options"> Options holding the candidate password map </param>
/// <returns> The opened archive, or null if it could not be opened at all </returns>
private RarArchive? GetRarArchive(FileEntry fileEntry, ExtractorOptions options)
{
    RarArchive? rarArchive = null;
    try
    {
        rarArchive = RarArchive.Open(fileEntry.Content);
    }
    catch (Exception e)
    {
        Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.RAR, fileEntry.FullPath, string.Empty, e.GetType());
    }
    // BUG FIX: the original dereferenced rarArchive unconditionally below, throwing
    // NullReferenceException whenever Open failed.
    if (rarArchive is null)
    {
        return null;
    }
    // Probe the first entry; an exception here indicates the archive is encrypted.
    var needsPassword = false;
    try
    {
        using var testStream = rarArchive.Entries.First().OpenEntryStream();
    }
    catch (Exception)
    {
        needsPassword = true;
    }
    if (needsPassword)
    {
        var passwordFound = false;
        foreach (var passwords in options.Passwords.Where(x => x.Key.IsMatch(fileEntry.Name)))
        {
            if (passwordFound)
            {
                break;
            }
            foreach (var password in passwords.Value)
            {
                try
                {
                    fileEntry.Content.Position = 0;
                    rarArchive = RarArchive.Open(fileEntry.Content, new SharpCompress.Readers.ReaderOptions() { Password = password, LookForHeader = true });
                    // Enumerating the entries forces header parsing, which throws if the
                    // password is wrong.
                    foreach (var entry in rarArchive.Entries)
                    {
                    }
                    // BUG FIX: the original never set passwordFound, so the outer loop's
                    // early-exit check could never fire and later password lists were
                    // tried even after success (compare GetSevenZipArchive).
                    passwordFound = true;
                    break;
                }
                catch (Exception e)
                {
                    Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.RAR, fileEntry.FullPath, string.Empty, e.GetType());
                }
            }
        }
    }
    return rarArchive;
}
/// <summary>
/// Asynchronously extracts an archive file created with GNU ar, recursing into each inner file.
/// </summary>
/// <param name="fileEntry"> The ar archive to extract </param>
/// <param name="options"> Extractor options </param>
/// <param name="governor"> Resource governor </param>
/// <returns> The extracted FileEntries </returns>
public async IAsyncEnumerable<FileEntry> ExtractAsync(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor)
{
    // Stream each member of the ar archive, then recurse into it.
    await foreach (var arEntry in ArFile.GetFileEntriesAsync(fileEntry, options, governor))
    {
        var innerEntries = Context.ExtractAsync(arEntry, options, governor);
        await foreach (var innerEntry in innerEntries)
        {
            yield return innerEntry;
        }
    }
}
/// <summary>
/// Creates an Extractor backed by a TransformBlock running with unbounded parallelism.
/// </summary>
/// <param name="options"> Options for this extractor </param>
/// <param name="transform"> Optional transform; defaults to the built-in Extract method </param>
public Extractor(ExtractorOptions options, Func<IDocument, ExtractResult> transform = null)
{
    _options = options;
    // Fall back to the built-in Extract when the caller supplies no transform.
    Func<IDocument, ExtractResult> effectiveTransform = transform ?? Extract;
    var blockOptions = new ExecutionDataflowBlockOptions { MaxDegreeOfParallelism = -1 };
    _inner = new TransformBlock<IDocument, ExtractResult>(effectiveTransform, blockOptions);
}
/// <summary>
/// Creates an Extractor, substituting default options when none are provided.
/// </summary>
/// <param name="options"> Options to use, or null for defaults </param>
public Extractor(ExtractorOptions options = null)
{
    // Null-coalesce to a fresh default options instance.
    this.options = options ?? new ExtractorOptions();
}
/// <summary>
/// Opens fileEntry as a 7-Zip archive, detecting encryption and trying any
/// configured passwords whose filename pattern matches.
/// </summary>
/// <param name="fileEntry"> The FileEntry holding the 7z content </param>
/// <param name="options"> Options holding the candidate password map </param>
/// <returns> The opened archive (possibly still unusable), or null if Open threw </returns>
private SevenZipArchive? GetSevenZipArchive(FileEntry fileEntry, ExtractorOptions options)
{
    SevenZipArchive? sevenZipArchive = null;
    try
    {
        sevenZipArchive = SevenZipArchive.Open(fileEntry.Content);
    }
    catch (Exception e)
    {
        Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.P7ZIP, fileEntry.FullPath, string.Empty, e.GetType());
    }
    // An encrypted archive reports 0 uncompressed size (or throws) when read without a password.
    var needsPassword = false;
    try
    {
        needsPassword = sevenZipArchive?.TotalUncompressSize == 0;
    }
    catch (Exception)
    {
        needsPassword = true;
    }
    if (needsPassword is true)
    {
        var passwordFound = false;
        foreach (var passwords in options.Passwords.Where(x => x.Key.IsMatch(fileEntry.Name)))
        {
            if (passwordFound)
            {
                break;
            }
            foreach (var password in passwords.Value)
            {
                try
                {
                    // NOTE(review): unlike the RAR path, the stream position is not reset
                    // before reopening — confirm SevenZipArchive.Open rewinds the stream.
                    sevenZipArchive = SevenZipArchive.Open(fileEntry.Content, new SharpCompress.Readers.ReaderOptions() { Password = password });
                    // A nonzero uncompressed size indicates the password decrypted the headers.
                    if (sevenZipArchive.TotalUncompressSize > 0)
                    {
                        passwordFound = true;
                        break;
                    }
                }
                catch (Exception e)
                {
                    Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.P7ZIP, fileEntry.FullPath, string.Empty, e.GetType());
                }
            }
        }
    }
    return(sevenZipArchive);
}
/// <summary> /// Extracts an an ISO file /// </summary> /// <param name="fileEntry"> </param> /// <returns> </returns> public IEnumerable <FileEntry> Extract(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor) { using var cd = new CDReader(fileEntry.Content, true); var entries = cd.Root.GetFiles("*.*", SearchOption.AllDirectories); if (entries != null) { if (options.Parallel) { var files = new ConcurrentStack <FileEntry>(); var batchSize = Math.Min(options.BatchSize, entries.Length); var selectedFileEntries = entries[0..batchSize];
/// <summary>
/// Extracts a WIM file contained in fileEntry, walking every image in the WIM.
/// </summary>
/// <param name="fileEntry"> FileEntry to extract </param>
/// <param name="options"> Extractor options (ExtractSelfOnFail controls the failure path) </param>
/// <param name="governor"> Resource governor charged with each file's length </param>
/// <returns> Extracted files </returns>
public IEnumerable<FileEntry> Extract(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor)
{
    DiscUtils.Wim.WimFile? baseFile = null;
    try
    {
        baseFile = new DiscUtils.Wim.WimFile(fileEntry.Content);
    }
    catch (Exception e)
    {
        Logger.Debug(e, "Failed to init WIM image.");
    }
    if (baseFile != null)
    {
        // A WIM may contain multiple images; walk each one's full file tree.
        for (var i = 0; i < baseFile.ImageCount; i++)
        {
            var image = baseFile.GetImage(i);
            foreach (var file in image.GetFiles(image.Root.FullName, "*.*", SearchOption.AllDirectories))
            {
                Stream? stream = null;
                try
                {
                    var info = image.GetFileInfo(file);
                    stream = info.OpenRead();
                    governor.CheckResourceGovernor(info.Length);
                }
                catch (Exception e)
                {
                    Logger.Debug("Error reading {0} from WIM {1} ({2}:{3})", file, image.FriendlyName, e.GetType(), e.Message);
                }
                if (stream != null)
                {
                    // Prefix names with the image's friendly name to keep images distinct.
                    var name = file.Replace('\\', Path.DirectorySeparatorChar);
                    var newFileEntry = new FileEntry($"{image.FriendlyName}{Path.DirectorySeparatorChar}{name}", stream, fileEntry);
                    foreach (var entry in Context.Extract(newFileEntry, options, governor))
                    {
                        yield return(entry);
                    }
                    stream.Dispose();
                }
            }
        }
    }
    else
    {
        // Could not open the WIM: optionally surface the original entry as-is.
        if (options.ExtractSelfOnFail)
        {
            yield return(fileEntry);
        }
    }
}
/// <summary>
/// Extracts an xz file contained in fileEntry. XZ holds a single compressed stream,
/// which is recursed into (it may itself be an archive, e.g. .tar.xz).
/// </summary>
/// <param name="fileEntry"> FileEntry to extract </param>
/// <param name="options"> Extractor options (ExtractSelfOnFail controls the failure path) </param>
/// <param name="governor"> Resource governor charged with the declared uncompressed size </param>
/// <returns> Extracted files </returns>
public IEnumerable<FileEntry> Extract(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor)
{
    XZStream? xzStream = null;
    try
    {
        xzStream = new XZStream(fileEntry.Content);
    }
    catch (Exception e)
    {
        Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.XZ, fileEntry.FullPath, string.Empty, e.GetType());
    }
    if (xzStream != null)
    {
        // The inner file's name is the xz name with its extension stripped.
        var newFilename = Path.GetFileNameWithoutExtension(fileEntry.Name);
        var newFileEntry = new FileEntry(newFilename, xzStream, fileEntry);
        // SharpCompress does not expose metadata without a full read, so we need to decompress first,
        // and then abort if the bytes exceeded the governor's capacity.
        // Sum the uncompressed sizes declared in the xz index records.
        var streamLength = xzStream.Index?.Records?.Select(r => r.UncompressedSize)
            .Aggregate((ulong?)0, (a, b) => a + b);
        // BUG: Technically, we're casting a ulong to a long, but we don't expect 9 exabyte steams, so
        // low risk.
        if (streamLength.HasValue)
        {
            governor.CheckResourceGovernor((long)streamLength.Value);
        }
        // Abort on archive quines (self-referencing archives) to prevent infinite recursion.
        if (Extractor.IsQuine(newFileEntry))
        {
            Logger.Info(Extractor.IS_QUINE_STRING, fileEntry.Name, fileEntry.FullPath);
            throw new OverflowException();
        }
        foreach (var extractedFile in Context.Extract(newFileEntry, options, governor))
        {
            yield return(extractedFile);
        }
        xzStream.Dispose();
    }
    else
    {
        // Could not open the xz stream: optionally surface the original entry as-is.
        if (options.ExtractSelfOnFail)
        {
            yield return(fileEntry);
        }
    }
}
/// <summary>
/// CLI entry point for extraction: configures NLog console logging based on verbosity flags,
/// builds extractor options (including a wildcard password map), and extracts
/// options.Input into options.Output.
/// </summary>
/// <param name="options"> Parsed command-line options </param>
/// <returns> Process exit code (always 0) </returns>
public static int ExtractCommand(ExtractCommandOptions options)
{
    var config = new LoggingConfiguration();
    var consoleTarget = new ConsoleTarget
    {
        Name = "console",
        Layout = "${longdate}|${level:uppercase=true}|${logger}|${message}",
    };
    // Verbose wins over Debug; default minimum level is Info.
    if (options.Verbose)
    {
        config.AddRule(LogLevel.Trace, LogLevel.Fatal, consoleTarget, "*");
    }
    else if (options.Debug)
    {
        config.AddRule(LogLevel.Debug, LogLevel.Fatal, consoleTarget, "*");
    }
    else
    {
        config.AddRule(LogLevel.Info, LogLevel.Fatal, consoleTarget, "*");
    }
    LogManager.Configuration = config;
    var extractor = new Extractor();
    var extractorOptions = new ExtractorOptions()
    {
        ExtractSelfOnFail = true,
        Parallel = true,
        RawExtensions = options.RawExtensions
    };
    // All provided passwords are tried against every archive (".*" matches any name).
    if (options.Passwords?.Any() ?? false)
    {
        extractorOptions.Passwords = new Dictionary<Regex, List<string>>()
        {
            {
                new Regex(".*", RegexOptions.Compiled),
                options.Passwords.ToList()
            }
        };
    }
    var allowRegexes = options.AllowFilters?.Select(x => new Regex(x)) ?? Array.Empty<Regex>();
    var denyRegexes = options.DenyFilters?.Select(x => new Regex(x)) ?? Array.Empty<Regex>();
    extractor.ExtractToDirectory(options.Output, options.Input, extractorOptions, allowRegexes, denyRegexes, options.PrintNames);
    return(0);
}
/// <summary>
/// Parses a single file path into a FileSystemObject and records it; optionally crawls
/// it as an archive, and attempts certificate parsing for known certificate extensions.
/// </summary>
/// <param name="path"> Absolute path of the file to parse </param>
private void ParseFile(string path)
{
    Log.Verbose("Started parsing {0}", path);
    FileSystemObject obj = FilePathToFileSystemObject(path);
    if (obj != null)
    {
        HandleChange(obj);
        // If we know how to handle this as an archive, and crawling archives is enabled
        if (opts.CrawlArchives && MiniMagic.DetectFileType(path) != ArchiveFileType.UNKNOWN)
        {
            // NOTE(review): this local deliberately shadows the outer 'opts' used above.
            var opts = new ExtractorOptions() { ExtractSelfOnFail = false };
            Extractor extractor = new Extractor();
            // Record every entry extracted from the archive as its own FileSystemObject.
            foreach (var fso in extractor.ExtractFile(path, opts).Select(fileEntry => FileEntryToFileSystemObject(fileEntry)))
            {
                HandleChange(fso);
            }
        }
        // TODO: Also try parse .DER as a key
        if (path.EndsWith(".cer", StringComparison.CurrentCulture) ||
            path.EndsWith(".der", StringComparison.CurrentCulture) ||
            path.EndsWith(".p7b", StringComparison.CurrentCulture) ||
            path.EndsWith(".pfx", StringComparison.CurrentCulture))
        {
            try
            {
                using var certificate = new X509Certificate2(path);
                var certObj = new CertificateObject(
                    StoreLocation: StoreLocation.LocalMachine.ToString(),
                    StoreName: StoreName.Root.ToString(),
                    Certificate: new SerializableCertificate(certificate));
                HandleChange(certObj);
            }
            catch (Exception e)
            {
                // Not every matching file is a valid certificate; log and continue.
                Log.Verbose("Could not parse certificate from file: {0} ({1}:{2})", path, e.GetType(), e.Message);
            }
        }
    }
    Log.Verbose("Finished parsing {0}", path);
}
/// <summary>
/// Constructs a Tar extractor over the archive at options.archive_filepath,
/// defaulting the statemap path and initializing empty state/folder/file maps.
/// </summary>
/// <param name="options"> Options naming the archive and statemap paths </param>
/// <exception cref="NotSupportedException"> Thrown if the file is smaller than a minimal tar </exception>
public Tar(ExtractorOptions options) : base(options)
{
    // Reject files too small to hold even a single tar header.
    var archiveLength = new FileInfo(options.archive_filepath).Length;
    if (archiveLength < min_tar_size)
    {
        throw new NotSupportedException("The provided tar file is invalid");
    }
    filepath = options.archive_filepath;
    // Default the statemap to sit alongside the archive.
    if (options.statemap_filepath == "")
    {
        options.statemap_filepath = filepath + ".statemap";
    }
    statemapPath = options.statemap_filepath;
    states = new Dictionary<string, TarState>();
    folderList = new Dictionary<string, VFolder> { { "/", VFolder.RootFolder } };
    fileList = new Dictionary<string, VFile>();
}
/// <summary>
/// Extracts a BZip2 file contained in fileEntry. BZip2 holds a single compressed
/// stream, which is recursed into (it may itself be an archive).
/// </summary>
/// <param name="fileEntry"> FileEntry to extract </param>
/// <param name="options"> Extractor options (ExtractSelfOnFail controls the failure path) </param>
/// <param name="governor"> Resource governor charged with the stream length </param>
/// <returns> Extracted files </returns>
public IEnumerable<FileEntry> Extract(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor)
{
    BZip2Stream? bzip2Stream = null;
    try
    {
        bzip2Stream = new BZip2Stream(fileEntry.Content, SharpCompress.Compressors.CompressionMode.Decompress, false);
        governor.CheckResourceGovernor(bzip2Stream.Length);
    }
    catch (Exception e)
    {
        Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.BZIP2, fileEntry.FullPath, string.Empty, e.GetType());
    }
    if (bzip2Stream != null)
    {
        // The inner file's name is the bz2 name with its extension stripped.
        var newFilename = Path.GetFileNameWithoutExtension(fileEntry.Name);
        // BUG FIX: the original allocated an extra FileStream (temp file with
        // DeleteOnClose) or MemoryStream here that was never used and never
        // disposed — a dead allocation and handle leak. Removed; the FileEntry is
        // built directly from bzip2Stream as before.
        var newFileEntry = new FileEntry(newFilename, bzip2Stream, fileEntry);
        // Abort on archive quines (self-referencing archives) to prevent infinite recursion.
        if (Extractor.IsQuine(newFileEntry))
        {
            Logger.Info(Extractor.IS_QUINE_STRING, fileEntry.Name, fileEntry.FullPath);
            bzip2Stream.Dispose();
            throw new OverflowException();
        }
        foreach (var extractedFile in Context.Extract(newFileEntry, options, governor))
        {
            yield return extractedFile;
        }
        bzip2Stream.Dispose();
    }
    else if (options.ExtractSelfOnFail)
    {
        // Could not open the bzip2 stream: optionally surface the original entry as-is.
        yield return fileEntry;
    }
}
/// <summary>
/// Asynchronously extracts an ISO file, recursing into every file on the disc.
/// </summary>
/// <param name="fileEntry"> The ISO image to extract </param>
/// <param name="options"> Extractor options (ExtractSelfOnFail controls the failure path) </param>
/// <param name="governor"> Resource governor charged with each file's length </param>
/// <returns> The extracted FileEntries </returns>
public async IAsyncEnumerable<FileEntry> ExtractAsync(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor)
{
    using var cd = new CDReader(fileEntry.Content, true);
    var entries = cd.Root.GetFiles("*.*", SearchOption.AllDirectories);
    if (entries != null)
    {
        foreach (var file in entries)
        {
            var fileInfo = file;
            governor.CheckResourceGovernor(fileInfo.Length);
            Stream? stream = null;
            try
            {
                stream = fileInfo.OpenRead();
            }
            catch (Exception e)
            {
                Logger.Debug("Failed to extract {0} from ISO {1}. ({2}:{3})", fileInfo.Name, fileEntry.FullPath, e.GetType(), e.Message);
            }
            if (stream != null)
            {
                var name = fileInfo.Name.Replace('/', Path.DirectorySeparatorChar);
                var newFileEntry = await FileEntry.FromStreamAsync(name, stream, fileEntry);
                // Recurse into each extracted file (it may itself be an archive).
                var innerEntries = Context.ExtractAsync(newFileEntry, options, governor);
                await foreach (var entry in innerEntries)
                {
                    yield return(entry);
                }
            }
        }
    }
    else
    {
        // Could not list the disc: optionally surface the original entry as-is.
        if (options.ExtractSelfOnFail)
        {
            yield return(fileEntry);
        }
    }
}
/// <summary>
/// Asynchronously extracts a BZip2 file contained in fileEntry. BZip2 holds a single
/// compressed stream, which is recursed into (it may itself be an archive).
/// </summary>
/// <param name="fileEntry"> FileEntry to extract </param>
/// <param name="options"> Extractor options (ExtractSelfOnFail controls the failure path) </param>
/// <param name="governor"> Resource governor charged with the stream length </param>
/// <returns> Extracted files </returns>
public async IAsyncEnumerable<FileEntry> ExtractAsync(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor)
{
    BZip2Stream? bzip2Stream = null;
    try
    {
        bzip2Stream = new BZip2Stream(fileEntry.Content, SharpCompress.Compressors.CompressionMode.Decompress, false);
        governor.CheckResourceGovernor(bzip2Stream.Length);
    }
    catch (Exception e)
    {
        Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.BZIP2, fileEntry.FullPath, string.Empty, e.GetType());
    }
    if (bzip2Stream != null)
    {
        // The inner file's name is the bz2 name with its extension stripped.
        var newFilename = Path.GetFileNameWithoutExtension(fileEntry.Name);
        var newFileEntry = await FileEntry.FromStreamAsync(newFilename, bzip2Stream, fileEntry);
        // Abort on archive quines (self-referencing archives) to prevent infinite recursion.
        if (Extractor.IsQuine(newFileEntry))
        {
            Logger.Info(Extractor.IS_QUINE_STRING, fileEntry.Name, fileEntry.FullPath);
            bzip2Stream.Dispose();
            throw new OverflowException();
        }
        await foreach (var extractedFile in Context.ExtractAsync(newFileEntry, options, governor))
        {
            yield return(extractedFile);
        }
        bzip2Stream.Dispose();
    }
    else
    {
        // Could not open the bzip2 stream: optionally surface the original entry as-is.
        if (options.ExtractSelfOnFail)
        {
            yield return(fileEntry);
        }
    }
}
/// <summary>
/// Reads the sub resource and saves it to the given <paramref name="outputFolder"/>
/// </summary>
/// <param name="resources">The sub resource dictionary (url / data / mime-type entries)</param>
/// <param name="outputFolder">The output folder where to save the resource</param>
/// <param name="mainUri">The main uri of the web page</param>
/// <param name="options"><see cref="ExtractorOptions"/></param>
/// <param name="webPage">The main web page; its resource URLs are rewritten in place</param>
private void ProcessSubResources(
    IDictionary resources,
    string outputFolder,
    Uri mainUri,
    ExtractorOptions options,
    ref string webPage)
{
    Uri uri = null;
    byte[] data = null;
    string mimeType = null;
    // Pull the url, raw bytes and mime type out of the resource dictionary.
    foreach (DictionaryEntry resource in resources)
    {
        switch (resource.Key)
        {
            case WebResourceUrl:
                uri = new Uri((string)resource.Value);
                break;
            case WebResourceData:
                data = (byte[])resource.Value;
                break;
            case WebResourceMimeType:
                mimeType = (string)resource.Value;
                break;
        }
    }
    // Optionally drop javascript resources, blanking their reference in the page.
    if ((mimeType == "text/javascript" ||
         mimeType == "application/javascript" ||
         mimeType == "application/x-javascript") && options == ExtractorOptions.IgnoreJavaScriptFiles)
    {
        Logger.WriteToLog("Ignoring javascript file, replacing it with a empty string in the web page");
        ReplaceWebPageUrl(uri, mainUri, string.Empty, ref webPage);
        return;
    }
    if (data != null && uri != null && uri.LocalPath.StartsWith("/"))
    {
        var fileRelativeUri = uri.LocalPath.Replace(mainUri.AbsolutePath, string.Empty).TrimStart('/');
        var path = Path.Combine(outputFolder, fileRelativeUri);
        var fileInfo = new FileInfo(path);
        // Avoid clobbering an existing file/directory by falling back to a GUID name.
        if (fileInfo.Exists || File.Exists(fileInfo.DirectoryName) || Directory.Exists(fileInfo.FullName))
        {
            path = Path.Combine(outputFolder, Guid.NewGuid().ToString());
        }
        fileInfo = new FileInfo(path);
        // NOTE(review): the trailing-backslash check is Windows-specific — confirm this
        // code only targets Windows paths.
        if (!fileInfo.FullName.EndsWith(@"\"))
        {
            fileInfo.Directory?.Create();
            File.WriteAllBytes(fileInfo.FullName, data);
            // Point the page at the locally saved copy.
            ReplaceWebPageUrl(uri, mainUri, fileRelativeUri, ref webPage);
        }
        else
        {
            Logger.WriteToLog($"Ignoring url '{uri}'");
        }
    }
}
/// <summary>
/// Wires up the crawl pipeline: Fetcher -> Parser -> (Scraper, Extractor) -> Builder
/// -> (Storer, Provider) -> Dispatcher -> back to Fetcher.
/// </summary>
/// <param name="documentFactory">Factory used by the Parser to build documents</param>
/// <param name="store">Result store, shared by Storer and Provider</param>
/// <param name="frontier">Frontier of pending fetch targets</param>
protected Crawler(IDocumentFactory documentFactory, IKeyValueStore<string, Result> store, IKeyValueStore<string, FetchTarget> frontier)
{
    _store = store;
    _frontier = frontier;
    var fetcherOptions = new FetcherOptions
    {
        UserAgent = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
    };
    var parserOptions = new ParserOptions { };
    var scraperOptions = new ScraperOptions { };
    var extractorOptions = new ExtractorOptions { };
    //var storerOptions = new StorerOptions
    //{
    //};
    var builderOptions = new BuilderOptions { };
    var providerOptions = new ProviderOptions { };
    //var dispatcherOptions = new DispatcherOptions
    //{
    //};
    // Construct each pipeline stage.
    Fetcher = new Fetcher(fetcherOptions);
    Parser = new Parser(parserOptions, documentFactory);
    Scraper = new Scraper(scraperOptions);
    Extractor = new Extractor(extractorOptions);
    Storer = new Storer(store);
    Builder = new Builder(builderOptions);
    Provider = new Provider(providerOptions, store, frontier);
    Dispatcher = new Dispatcher();
    // Link the stages; only successful (HTTP 200) fetches flow into Parser and Builder.
    Fetcher.SendTo(Parser, x => x.StatusCode == System.Net.HttpStatusCode.OK);
    Parser.SendTo(Scraper);
    Parser.SendTo(Extractor);
    Fetcher.SendTo(Builder, x => x.StatusCode == System.Net.HttpStatusCode.OK);
    Scraper.SendTo(Builder);
    Extractor.SendTo(Builder);
    Builder.SendTo(Storer);
    //Storer.LinkTo(new ActionBlock<Result>(x =>
    //{
    //}));
    Builder.SendTo(Provider);
    // Provider feeds new non-null targets to the Dispatcher, which loops back to Fetcher.
    Provider.SendTo(Dispatcher, x => x != null);
    Dispatcher.SendTo(Fetcher);
}
/// <summary>
/// Extracts a RAR archive, in parallel batches when options.Parallel is set,
/// recursing into each extracted entry.
/// </summary>
/// <param name="fileEntry"> The RAR archive to extract </param>
/// <param name="options"> Extractor options (Parallel/BatchSize/ExtractSelfOnFail) </param>
/// <param name="governor"> Resource governor </param>
/// <returns> The extracted FileEntries </returns>
public IEnumerable<FileEntry> Extract(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor)
{
    var rarArchive = GetRarArchive(fileEntry, options);
    if (rarArchive != null)
    {
        // Only complete, non-directory entries are extractable.
        var entries = rarArchive.Entries.Where(x => x.IsComplete && !x.IsDirectory);
        if (options.Parallel)
        {
            var files = new ConcurrentStack<FileEntry>();
            while (entries.Any())
            {
                var batchSize = Math.Min(options.BatchSize, entries.Count());
                // Open a batch of entry streams up front so their sizes can be governed together.
                var streams = entries.Take(batchSize).Select(entry => (entry, entry.OpenEntryStream())).ToList();
                governor.CheckResourceGovernor(streams.Sum(x => x.Item2.Length));
                streams.AsParallel().ForAll(streampair =>
                {
                    try
                    {
                        var newFileEntry = new FileEntry(streampair.entry.Key, streampair.Item2, fileEntry);
                        if (Extractor.IsQuine(newFileEntry))
                        {
                            // Signal the governor to abort by exhausting its budget
                            // (cannot throw from inside ForAll without wrapping).
                            Logger.Info(Extractor.IS_QUINE_STRING, fileEntry.Name, fileEntry.FullPath);
                            governor.CurrentOperationProcessedBytesLeft = -1;
                        }
                        else
                        {
                            files.PushRange(Context.Extract(newFileEntry, options, governor).ToArray());
                        }
                    }
                    catch (Exception e)
                    {
                        Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.RAR, fileEntry.FullPath, streampair.entry.Key, e.GetType());
                    }
                });
                // Throws if a quine set the budget negative above.
                governor.CheckResourceGovernor(0);
                entries = entries.Skip(streams.Count);
                while (files.TryPop(out var result))
                {
                    if (result != null)
                    {
                        yield return(result);
                    }
                }
            }
        }
        else
        {
            foreach (var entry in entries)
            {
                governor.CheckResourceGovernor(entry.Size);
                FileEntry? newFileEntry = null;
                try
                {
                    var name = entry.Key.Replace('/', Path.DirectorySeparatorChar);
                    newFileEntry = new FileEntry(name, entry.OpenEntryStream(), fileEntry);
                }
                catch (Exception e)
                {
                    Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.RAR, fileEntry.FullPath, entry.Key, e.GetType());
                }
                if (newFileEntry != null)
                {
                    // Abort on archive quines (self-referencing archives).
                    if (Extractor.IsQuine(newFileEntry))
                    {
                        Logger.Info(Extractor.IS_QUINE_STRING, fileEntry.Name, fileEntry.FullPath);
                        throw new OverflowException();
                    }
                    foreach (var extractedFile in Context.Extract(newFileEntry, options, governor))
                    {
                        yield return(extractedFile);
                    }
                }
            }
        }
    }
    else
    {
        // Could not open the archive: optionally surface the original entry as-is.
        if (options.ExtractSelfOnFail)
        {
            yield return(fileEntry);
        }
    }
}
/// <summary>
/// Dump the FileEntries from a Logical Volume, detecting every file system on the
/// volume and extracting each file it contains (in batches when options.Parallel is set).
/// </summary>
/// <param name="volume">The Volume to dump</param>
/// <param name="parentPath">The Path to the parent Disc</param>
/// <param name="options">Extractor Options to use</param>
/// <param name="governor">Resource Governor to use</param>
/// <param name="Context">Extractor context to use</param>
/// <param name="parent">The Parent FileEntry</param>
/// <returns>The extracted FileEntries</returns>
public static IEnumerable<FileEntry> DumpLogicalVolume(LogicalVolumeInfo volume, string parentPath, ExtractorOptions options, ResourceGovernor governor, Extractor Context, FileEntry? parent = null)
{
    DiscUtils.FileSystemInfo[]? fsInfos = null;
    try
    {
        fsInfos = FileSystemManager.DetectFileSystems(volume);
    }
    catch (Exception e)
    {
        Logger.Debug("Failed to get file systems from logical volume {0} Image {1} ({2}:{3})", volume.Identity, parentPath, e.GetType(), e.Message);
    }
    foreach (var fsInfo in fsInfos ?? Array.Empty<DiscUtils.FileSystemInfo>())
    {
        using var fs = fsInfo.Open(volume);
        var diskFiles = fs.GetFiles(fs.Root.FullName, "*.*", SearchOption.AllDirectories).ToList();
        if (options.Parallel)
        {
            var files = new ConcurrentStack<FileEntry>();
            while (diskFiles.Any())
            {
                var batchSize = Math.Min(options.BatchSize, diskFiles.Count);
                var range = diskFiles.GetRange(0, batchSize);
                var fileinfos = new List<(DiscFileInfo, Stream)>();
                long totalLength = 0;
                // Open the whole batch first so its combined size can be governed at once.
                foreach (var r in range)
                {
                    try
                    {
                        var fi = fs.GetFileInfo(r);
                        totalLength += fi.Length;
                        fileinfos.Add((fi, fi.OpenRead()));
                    }
                    catch (Exception e)
                    {
                        Logger.Debug("Failed to get FileInfo from {0} in Volume {1} @ {2} ({3}:{4})", r, volume.Identity, parentPath, e.GetType(), e.Message);
                    }
                }
                governor.CheckResourceGovernor(totalLength);
                // Extract each opened file in parallel, collecting results on a concurrent stack.
                fileinfos.AsParallel().ForAll(file =>
                {
                    if (file.Item2 != null)
                    {
                        // Prefix names with the volume identity to keep volumes distinct.
                        var newFileEntry = new FileEntry($"{volume.Identity}{Path.DirectorySeparatorChar}{file.Item1.FullName}", file.Item2, parent);
                        var entries = Context.ExtractFile(newFileEntry, options, governor);
                        files.PushRange(entries.ToArray());
                    }
                });
                diskFiles.RemoveRange(0, batchSize);
                while (files.TryPop(out var result))
                {
                    if (result != null)
                    {
                        yield return(result);
                    }
                }
            }
        }
        else
        {
            foreach (var file in diskFiles)
            {
                Stream? fileStream = null;
                try
                {
                    var fi = fs.GetFileInfo(file);
                    governor.CheckResourceGovernor(fi.Length);
                    fileStream = fi.OpenRead();
                }
                catch (Exception e)
                {
                    Logger.Debug(e, "Failed to open {0} in volume {1}", file, volume.Identity);
                }
                if (fileStream != null)
                {
                    var newFileEntry = new FileEntry($"{volume.Identity}{Path.DirectorySeparatorChar}{file}", fileStream, parent);
                    var entries = Context.ExtractFile(newFileEntry, options, governor);
                    foreach (var entry in entries)
                    {
                        yield return(entry);
                    }
                }
            }
        }
    }
}
/// <summary>
/// Dump the FileEntries from a Logical Volume asynchronously, detecting every file
/// system on the volume and extracting each file it contains.
/// </summary>
/// <param name="volume">The Volume to dump</param>
/// <param name="parentPath">The Path to the parent Disc</param>
/// <param name="options">Extractor Options to use</param>
/// <param name="governor">Resource Governor to use</param>
/// <param name="Context">Extractor context to use</param>
/// <param name="parent">The Parent FileEntry</param>
/// <returns>The extracted FileEntries</returns>
public static async IAsyncEnumerable<FileEntry> DumpLogicalVolumeAsync(LogicalVolumeInfo volume, string parentPath, ExtractorOptions options, ResourceGovernor governor, Extractor Context, FileEntry? parent = null)
{
    DiscUtils.FileSystemInfo[]? fsInfos = null;
    try
    {
        fsInfos = FileSystemManager.DetectFileSystems(volume);
    }
    catch (Exception e)
    {
        Logger.Debug("Failed to get file systems from logical volume {0} Image {1} ({2}:{3})", volume.Identity, parentPath, e.GetType(), e.Message);
    }
    foreach (var fsInfo in fsInfos ?? Array.Empty<DiscUtils.FileSystemInfo>())
    {
        using var fs = fsInfo.Open(volume);
        var diskFiles = fs.GetFiles(fs.Root.FullName, "*.*", SearchOption.AllDirectories).ToList();
        foreach (var file in diskFiles)
        {
            Stream? fileStream = null;
            DiscFileInfo? fi = null;
            try
            {
                fi = fs.GetFileInfo(file);
                governor.CheckResourceGovernor(fi.Length);
                fileStream = fi.OpenRead();
            }
            catch (Exception e)
            {
                Logger.Debug(e, "Failed to open {0} in volume {1}", file, volume.Identity);
            }
            if (fileStream != null && fi != null)
            {
                // Prefix names with the volume identity to keep volumes distinct.
                var newFileEntry = await FileEntry.FromStreamAsync($"{volume.Identity}{Path.DirectorySeparatorChar}{fi.FullName}", fileStream, parent);
                var entries = Context.ExtractAsync(newFileEntry, options, governor);
                await foreach (var entry in entries)
                {
                    yield return(entry);
                }
            }
        }
    }
}
/// <summary>
/// Wires up the crawl pipeline: Fetcher -> Parser -> (Scraper, Extractor) -> Builder
/// -> (Storer, Provider) -> Dispatcher -> back to Fetcher.
/// </summary>
/// <param name="documentFactory">Factory used by the Parser to build documents</param>
/// <param name="store">Result store, shared by Storer and Provider</param>
/// <param name="frontier">Frontier of pending fetch targets</param>
protected Crawler(IDocumentFactory documentFactory, IKeyValueStore<string, Result> store, IKeyValueStore<string, FetchTarget> frontier)
{
    _store = store;
    _frontier = frontier;
    var fetcherOptions = new FetcherOptions
    {
        UserAgent = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
    };
    var parserOptions = new ParserOptions { };
    var scraperOptions = new ScraperOptions { };
    var extractorOptions = new ExtractorOptions { };
    //var storerOptions = new StorerOptions
    //{
    //};
    var builderOptions = new BuilderOptions { };
    var providerOptions = new ProviderOptions { };
    //var dispatcherOptions = new DispatcherOptions
    //{
    //};
    // Construct each pipeline stage.
    Fetcher = new Fetcher(fetcherOptions);
    Parser = new Parser(parserOptions, documentFactory);
    Scraper = new Scraper(scraperOptions);
    Extractor = new Extractor(extractorOptions);
    Storer = new Storer(store);
    Builder = new Builder(builderOptions);
    Provider = new Provider(providerOptions, store, frontier);
    Dispatcher = new Dispatcher();
    // Link the stages; only successful (HTTP 200) fetches flow into Parser and Builder.
    Fetcher.SendTo(Parser, x => x.StatusCode == System.Net.HttpStatusCode.OK);
    Parser.SendTo(Scraper);
    Parser.SendTo(Extractor);
    Fetcher.SendTo(Builder, x => x.StatusCode == System.Net.HttpStatusCode.OK);
    Scraper.SendTo(Builder);
    Extractor.SendTo(Builder);
    Builder.SendTo(Storer);
    //Storer.LinkTo(new ActionBlock<Result>(x =>
    //{
    //}));
    Builder.SendTo(Provider);
    // Provider feeds new non-null targets to the Dispatcher, which loops back to Fetcher.
    Provider.SendTo(Dispatcher, x => x != null);
    Dispatcher.SendTo(Fetcher);
}
/// <summary>
/// Extracts a 7-Zip file contained in fileEntry.
/// </summary>
/// <param name="fileEntry"> FileEntry to extract </param>
/// <param name="options"> Extractor options; Parallel selects the batched parallel path. </param>
/// <param name="governor"> Resource governor bounding the bytes processed. </param>
/// <returns> Extracted files </returns>
public IEnumerable<FileEntry> Extract(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor)
{
    var sevenZipArchive = GetSevenZipArchive(fileEntry, options);
    if (sevenZipArchive != null)
    {
        var entries = sevenZipArchive.Entries.Where(x => !x.IsDirectory && x.IsComplete).ToList();
        if (options.Parallel)
        {
            var files = new ConcurrentStack<FileEntry>();
            while (entries.Count > 0)
            {
                var batchSize = Math.Min(options.BatchSize, entries.Count);
                // FIX: materialize the batch. The Select opens an entry stream as a side effect;
                // leaving it deferred meant it was enumerated twice (once by Sum below, once by
                // ForAll), opening every stream twice and leaking the first set.
                var selectedEntries = entries.GetRange(0, batchSize).Select(entry => (entry, entry.OpenEntryStream())).ToList();
                governor.CheckResourceGovernor(selectedEntries.Sum(x => x.entry.Size));
                try
                {
                    selectedEntries.AsParallel().ForAll(entry =>
                    {
                        try
                        {
                            var name = entry.entry.Key.Replace('/', Path.DirectorySeparatorChar);
                            var newFileEntry = new FileEntry(name, entry.Item2, fileEntry);
                            if (Extractor.IsQuine(newFileEntry))
                            {
                                // A quine (archive containing itself) would recurse forever;
                                // poison the governor so the outer loop aborts.
                                Logger.Info(Extractor.IS_QUINE_STRING, fileEntry.Name, fileEntry.FullPath);
                                governor.CurrentOperationProcessedBytesLeft = -1;
                            }
                            else
                            {
                                files.PushRange(Context.Extract(newFileEntry, options, governor).ToArray());
                            }
                        }
                        catch (Exception e) when (e is OverflowException)
                        {
                            // Resource limit exceeded: log and propagate out of the parallel loop.
                            Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.P7ZIP, fileEntry.FullPath, entry.entry.Key, e.GetType());
                            throw;
                        }
                        catch (Exception e)
                        {
                            // Any other per-entry failure is logged and skipped.
                            Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.P7ZIP, fileEntry.FullPath, entry.entry.Key, e.GetType());
                        }
                    });
                }
                catch (Exception e) when (e is AggregateException)
                {
                    // Unwrap an OverflowException thrown inside ForAll so callers see the
                    // same exception type as the sequential path.
                    if (e.InnerException?.GetType() == typeof(OverflowException))
                    {
                        throw e.InnerException;
                    }
                    throw;
                }
                governor.CheckResourceGovernor(0);
                entries.RemoveRange(0, batchSize);
                while (files.TryPop(out var result))
                {
                    if (result != null)
                    {
                        yield return result;
                    }
                }
            }
        }
        else
        {
            foreach (var entry in entries)
            {
                governor.CheckResourceGovernor(entry.Size);
                var name = entry.Key.Replace('/', Path.DirectorySeparatorChar);
                var newFileEntry = new FileEntry(name, entry.OpenEntryStream(), fileEntry);
                if (Extractor.IsQuine(newFileEntry))
                {
                    Logger.Info(Extractor.IS_QUINE_STRING, fileEntry.Name, fileEntry.FullPath);
                    throw new OverflowException();
                }
                foreach (var extractedFile in Context.Extract(newFileEntry, options, governor))
                {
                    yield return extractedFile;
                }
            }
        }
    }
    else
    {
        // Could not open as 7-Zip; optionally emit the original entry unchanged.
        if (options.ExtractSelfOnFail)
        {
            yield return fileEntry;
        }
    }
}
/// <summary>
/// Extract the given <paramref name="inputFile"/> to the given <paramref name="outputFolder"/>
/// </summary>
/// <param name="inputFile">The input file</param>
/// <param name="outputFolder">The folder where to save the extracted web archive</param>
/// <param name="options"><see cref="ExtractorOptions"/></param>
/// <param name="logStream">When set then logging is written to this stream</param>
/// <returns>The full path of the extracted main web page (webpage.html)</returns>
/// <exception cref="WAEInvalidFile">Raised when the <paramref name="inputFile"/> is not a valid Safari web archive</exception>
/// <exception cref="WAEResourceMissing">Raised when a required resource is not found in the web archive</exception>
/// <exception cref="FileNotFoundException">Raised when the <paramref name="inputFile"/> is not found</exception>
/// <exception cref="DirectoryNotFoundException">Raised when the <paramref name="outputFolder"/> does not exist</exception>
public string Extract(string inputFile, string outputFolder, ExtractorOptions options = ExtractorOptions.None, Stream logStream = null)
{
    if (logStream != null)
    {
        Logger.LogStream = logStream;
    }

    try
    {
        if (!Directory.Exists(outputFolder))
        {
            throw new DirectoryNotFoundException($"The output folder '{outputFolder}' does not exist");
        }

        // Safari web archives are binary plists.
        var reader = new PList.BinaryPlistReader();
        IDictionary archive;
        try
        {
            archive = reader.ReadObject(inputFile);
        }
        catch (Exception exception)
        {
            throw new WAEInvalidFile($"The file '{inputFile}' is not a valid Safari web archive", exception);
        }

        if (!archive.Contains(WebMainResource))
        {
            var message = $"Can't find the resource '{WebMainResource}' in the webarchive";
            Logger.WriteToLog(message);
            throw new WAEResourceMissing(message);
        }

        var mainResource = (IDictionary)archive[WebMainResource];
        var webPageFileName = Path.Combine(outputFolder, "webpage.html");
        Logger.WriteToLog($"Getting main web page from '{WebMainResource}'");
        var webPage = ProcessMainResource(mainResource, out var mainUri);
#if (DEBUG)
        File.WriteAllText(webPageFileName, webPage);
#endif
        if (!archive.Contains(WebSubresources))
        {
            Logger.WriteToLog("Web archive does not contain any sub resources");
        }
        else
        {
            // Extract each sub resource and rewrite its references inside the main page.
            var subResources = (object[])archive[WebSubresources];
            var count = subResources.Length;
            Logger.WriteToLog($"Web archive has {count} sub resource{(count > 1 ? "s" : string.Empty)}");
            foreach (IDictionary subResource in subResources)
            {
                ProcessSubResources(subResource, outputFolder, mainUri, options, ref webPage);
            }
        }

        if (!archive.Contains(WebSubframeArchives))
        {
            Logger.WriteToLog("Web archive does not contain any sub frame archives");
        }
        else
        {
            var subFrameResources = (object[])archive[WebSubframeArchives];
            var subFrameResourcesCount = subFrameResources.Length;
            Logger.WriteToLog($"Web archive has {subFrameResourcesCount} sub frame resource{(subFrameResourcesCount > 1 ? "s" : string.Empty)}");
            var i = 1;
            foreach (IDictionary subFrameResource in subFrameResources)
            {
                var subFrameMainResource = (IDictionary)subFrameResource[WebMainResource];
                Logger.WriteToLog($"Getting web page from sub frame resource '{WebMainResource}'");
                var subFrameResourceWebPage = ProcessSubFrameMainResource(subFrameMainResource, out var frameName, out var subFrameMainUri);
                var subFrameOutputFolder = Path.Combine(outputFolder, $"subframe_{i}");
                Logger.WriteToLog($"Creating folder '{subFrameOutputFolder}' for iframe '{frameName}' content");
                Directory.CreateDirectory(subFrameOutputFolder);

                var subFrameSubResources = (object[])subFrameResource[WebSubresources];
                if (subFrameSubResources == null)
                {
                    Logger.WriteToLog("Web archive sub frame does not contain any sub resources");
                }
                else
                {
                    var subFrameSubResourcesCount = subFrameSubResources.Length;
                    Logger.WriteToLog($"Web archive sub frame has {subFrameSubResourcesCount} sub resource{(subFrameSubResourcesCount > 1 ? "s" : string.Empty)}");
                    foreach (IDictionary subFrameSubResource in subFrameSubResources)
                    {
                        ProcessSubResources(subFrameSubResource, subFrameOutputFolder, subFrameMainUri, options, ref subFrameResourceWebPage);
                    }
                }

                var subFrameWebPageFileName = Path.Combine(subFrameOutputFolder, "webpage.html");
                // BUG FIX: build the relative URI with the SAME index used for the folder above.
                // Previously 'i' was incremented before this line, so links in the main page
                // pointed at subframe_{i+1}/webpage.html while the file was written to
                // subframe_{i}/webpage.html. The increment now happens at the end of the loop.
                var subFrameWebPageRelativeUri = $"subframe_{i}/webpage.html";
                var subFrameUri = subFrameMainUri.ToString();
                var subFrameUriWithoutScheme = subFrameUri.Replace($"{subFrameMainUri.Scheme}:", string.Empty);
                var subFrameUriWithoutMainUri = subFrameUri.Replace($"{mainUri.Scheme}://{mainUri.Host}{mainUri.AbsolutePath}", string.Empty);

                // Try the frame URL in progressively more relaxed forms until one matches.
                if (webPage.Contains(subFrameUri))
                {
                    Logger.WriteToLog($"Replacing '{subFrameUri}' with '{subFrameWebPageRelativeUri}'");
                    webPage = webPage.Replace(subFrameUri, subFrameWebPageRelativeUri);
                }
                else if (webPage.Contains(subFrameUriWithoutScheme))
                {
                    Logger.WriteToLog($"Replacing '{subFrameUriWithoutScheme}' with '{subFrameWebPageRelativeUri}'");
                    webPage = webPage.Replace(subFrameUriWithoutScheme, $"{subFrameWebPageRelativeUri}");
                }
                else if (webPage.Contains(subFrameUriWithoutMainUri))
                {
                    Logger.WriteToLog($"Replacing '{subFrameUriWithoutMainUri}' with '{subFrameWebPageRelativeUri}'");
                    webPage = webPage.Replace(subFrameUriWithoutMainUri, $"{subFrameWebPageRelativeUri}");
                }
                else
                {
                    Logger.WriteToLog($"Could not find any resources with url '{subFrameUri}' in the web page");
                }

                File.WriteAllText(subFrameWebPageFileName, subFrameResourceWebPage);
                i += 1;
            }
        }

        File.WriteAllText(webPageFileName, webPage);
        return webPageFileName;
    }
    catch (Exception exception)
    {
        Logger.WriteToLog(ExceptionHelpers.GetInnerException(exception));
        throw;
    }
}
/// <summary>
/// Extracts a WIM file contained in fileEntry. Each image in the WIM is walked and every
/// contained file is recursively extracted via the Extractor context.
/// </summary>
/// <param name="fileEntry"> FileEntry to extract </param>
/// <param name="options"> Extractor options; Parallel selects the batched parallel path. </param>
/// <param name="governor"> Resource governor bounding the bytes processed. </param>
/// <returns> Extracted files </returns>
public async IAsyncEnumerable <FileEntry> ExtractAsync(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor)
{
    DiscUtils.Wim.WimFile? baseFile = null;
    try
    {
        baseFile = new DiscUtils.Wim.WimFile(fileEntry.Content);
    }
    catch (Exception e)
    {
        // Not a parseable WIM image; falls through to the ExtractSelfOnFail handling below.
        Logger.Debug(e, "Failed to init WIM image.");
    }
    if (baseFile != null)
    {
        if (options.Parallel)
        {
            // Parallel path: open files in batches of options.BatchSize, extract each batch
            // with PLINQ, and drain results from a concurrent stack between batches.
            var files = new ConcurrentStack<FileEntry>();
            for (var i = 0; i < baseFile.ImageCount; i++)
            {
                var image = baseFile.GetImage(i);
                var fileList = image.GetFiles(image.Root.FullName, "*.*", SearchOption.AllDirectories).ToList();
                while (fileList.Count > 0)
                {
                    var batchSize = Math.Min(options.BatchSize, fileList.Count);
                    var range = fileList.Take(batchSize);
                    var streamsAndNames = new List<(DiscFileInfo, Stream)>();
                    foreach (var file in range)
                    {
                        try
                        {
                            var info = image.GetFileInfo(file);
                            var read = info.OpenRead();
                            streamsAndNames.Add((info, read));
                        }
                        catch (Exception e)
                        {
                            // Unreadable entries are logged and skipped; the batch continues.
                            Logger.Debug("Error reading {0} from WIM {1} ({2}:{3})", file, image.FriendlyName, e.GetType(), e.Message);
                        }
                    }
                    // Governor is charged for the whole batch at once, using the declared lengths.
                    governor.CheckResourceGovernor(streamsAndNames.Sum(x => x.Item1.Length));
                    // NOTE(review): the sync Context.Extract is used here (inside ForAll), unlike
                    // the sequential branch below which awaits Context.ExtractAsync. Also, the
                    // opened streams are not explicitly disposed in this path — presumably
                    // FileEntry takes ownership of its content stream; confirm against FileEntry.
                    streamsAndNames.AsParallel().ForAll(file =>
                    {
                        var newFileEntry = new FileEntry($"{image.FriendlyName}\\{file.Item1.FullName}", file.Item2, fileEntry);
                        var entries = Context.Extract(newFileEntry, options, governor);
                        if (entries.Any())
                        {
                            files.PushRange(entries.ToArray());
                        }
                    });
                    fileList.RemoveRange(0, batchSize);
                    // Yield everything the batch produced before starting the next batch.
                    while (files.TryPop(out var result))
                    {
                        if (result != null)
                        {
                            yield return(result);
                        }
                    }
                }
            }
        }
        else
        {
            // Sequential path: one file at a time, fully async.
            for (var i = 0; i < baseFile.ImageCount; i++)
            {
                var image = baseFile.GetImage(i);
                foreach (var file in image.GetFiles(image.Root.FullName, "*.*", SearchOption.AllDirectories))
                {
                    Stream? stream = null;
                    try
                    {
                        var info = image.GetFileInfo(file);
                        stream = info.OpenRead();
                        governor.CheckResourceGovernor(info.Length);
                    }
                    catch (Exception e)
                    {
                        // Unreadable entries are logged and skipped.
                        Logger.Debug("Error reading {0} from WIM {1} ({2}:{3})", file, image.FriendlyName, e.GetType(), e.Message);
                    }
                    if (stream != null)
                    {
                        // Normalize WIM's backslash paths to the platform separator.
                        var name = file.Replace('\\', Path.DirectorySeparatorChar);
                        var newFileEntry = await FileEntry.FromStreamAsync($"{image.FriendlyName}{Path.DirectorySeparatorChar}{name}", stream, fileEntry);
                        await foreach (var entry in Context.ExtractAsync(newFileEntry, options, governor))
                        {
                            yield return(entry);
                        }
                        stream.Dispose();
                    }
                }
            }
        }
    }
    else
    {
        // Could not open as WIM; optionally emit the original entry unchanged.
        if (options.ExtractSelfOnFail)
        {
            yield return(fileEntry);
        }
    }
}
/// <summary>
/// Finds a working password for <paramref name="zipEntry"/> by trying every configured
/// password whose filename pattern matches the entry's containing file.
/// </summary>
/// <param name="fileEntry"> The FileEntry whose name is matched against the password patterns. </param>
/// <param name="zipFile"> The open ZipFile; its Password property is set while probing. </param>
/// <param name="zipEntry"> The entry used to test each candidate password. </param>
/// <param name="options"> Options holding the pattern-to-passwords map. </param>
/// <returns> The first password that opens the entry, or null if none work. </returns>
private string? GetZipPassword(FileEntry fileEntry, ZipFile zipFile, ZipEntry zipEntry, ExtractorOptions options)
{
    // Flatten all password lists whose regex matches this file name, preserving order.
    var candidates = options.Passwords
        .Where(set => set.Key.IsMatch(fileEntry.Name))
        .SelectMany(set => set.Value);

    foreach (var candidate in candidates)
    {
        zipFile.Password = candidate;
        try
        {
            // Opening the entry stream is the probe: it throws if the password is wrong.
            using var probe = zipFile.GetInputStream(zipEntry);
            return candidate;
        }
        catch (Exception e)
        {
            Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.ZIP, fileEntry.FullPath, zipEntry.Name, e.GetType());
        }
    }
    return null;
}