Ejemplo n.º 1
0
        /// <summary>
        ///     Extracts an Gzip file contained in fileEntry. Since this function is recursive, even though
        ///     Gzip only supports a single compressed file, that inner file could itself contain multiple others.
        /// </summary>
        /// <param name="fileEntry"> FileEntry to extract </param>
        /// <returns> Extracted files </returns>
        public async IAsyncEnumerable <FileEntry> ExtractAsync(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor)
        {
            GZipArchive?gzipArchive = null;

            try
            {
                gzipArchive = GZipArchive.Open(fileEntry.Content);
            }
            catch (Exception e)
            {
                Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.GZIP, fileEntry.FullPath, string.Empty, e.GetType());
            }
            if (gzipArchive != null)
            {
                foreach (var entry in gzipArchive.Entries)
                {
                    if (entry.IsDirectory)
                    {
                        continue;
                    }

                    governor.CheckResourceGovernor(entry.Size);

                    var newFilename = Path.GetFileNameWithoutExtension(fileEntry.Name);
                    if (fileEntry.Name.EndsWith(".tgz", StringComparison.InvariantCultureIgnoreCase))
                    {
                        newFilename = newFilename[0..^ 4] + ".tar";
Ejemplo n.º 2
0
        /// <summary>
        ///     Extracts a 7-Zip file contained in fileEntry.
        /// </summary>
        /// <param name="fileEntry"> FileEntry to extract </param>
        /// <returns> Extracted files </returns>

        public async IAsyncEnumerable <FileEntry> ExtractAsync(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor)
        {
            var sevenZipArchive = GetSevenZipArchive(fileEntry, options);

            if (sevenZipArchive != null)
            {
                var entries = sevenZipArchive.Entries.Where(x => !x.IsDirectory && x.IsComplete).ToList();

                foreach (var entry in entries)
                {
                    governor.CheckResourceGovernor(entry.Size);
                    var name = entry.Key.Replace('/', Path.DirectorySeparatorChar);

                    var newFileEntry = await FileEntry.FromStreamAsync(name, entry.OpenEntryStream(), fileEntry);

                    if (Extractor.IsQuine(newFileEntry))
                    {
                        Logger.Info(Extractor.IS_QUINE_STRING, fileEntry.Name, fileEntry.FullPath);
                        throw new OverflowException();
                    }
                    await foreach (var extractedFile in Context.ExtractAsync(newFileEntry, options, governor))
                    {
                        yield return(extractedFile);
                    }
                }
            }
            else
            {
                if (options.ExtractSelfOnFail)
                {
                    yield return(fileEntry);
                }
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        ///     Extracts an a VHD file
        /// </summary>
        /// <param name="fileEntry"> </param>
        /// <returns> </returns>
        public IEnumerable <FileEntry> Extract(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor)
        {
            using var disk = new DiscUtils.Vhd.Disk(fileEntry.Content, Ownership.None);
            LogicalVolumeInfo[]? logicalVolumes = null;

            try
            {
                var manager = new VolumeManager(disk);
                logicalVolumes = manager.GetLogicalVolumes();
            }
            catch (Exception e)
            {
                Logger.Debug("Error reading {0} disk at {1} ({2}:{3})", disk.GetType(), fileEntry.FullPath, e.GetType(), e.Message);
            }

            if (logicalVolumes != null)
            {
                foreach (var volume in logicalVolumes)
                {
                    foreach (var entry in DiscCommon.DumpLogicalVolume(volume, fileEntry.FullPath, options, governor, Context, fileEntry))
                    {
                        yield return(entry);
                    }
                }
            }
            else
            {
                if (options.ExtractSelfOnFail)
                {
                    yield return(fileEntry);
                }
            }
        }
Ejemplo n.º 4
0
        /// <summary>
        ///     Extracts an zip file contained in fileEntry.
        /// </summary>
        /// <param name="fileEntry"> FileEntry to extract </param>
        /// <returns> Extracted files </returns>
        public async IAsyncEnumerable <FileEntry> ExtractAsync(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor)
        {
            ZipFile?zipFile = null;

            try
            {
                zipFile = new ZipFile(fileEntry.Content);
            }
            catch (Exception e)
            {
                Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.ZIP, fileEntry.FullPath, string.Empty, e.GetType());
            }
            if (zipFile != null)
            {
                var buffer        = new byte[BUFFER_SIZE];
                var passwordFound = false;
                foreach (ZipEntry?zipEntry in zipFile)
                {
                    if (zipEntry is null ||
                        zipEntry.IsDirectory ||
                        !zipEntry.CanDecompress)
                    {
                        continue;
                    }

                    if (zipEntry.IsCrypted && !passwordFound)
                    {
                        zipFile.Password = GetZipPassword(fileEntry, zipFile, zipEntry, options) ?? string.Empty;
                        passwordFound    = true;
                    }

                    governor.CheckResourceGovernor(zipEntry.Size);

                    using var fs = new FileStream(Path.GetTempFileName(), FileMode.Create, FileAccess.ReadWrite, FileShare.ReadWrite, 4096, FileOptions.DeleteOnClose);
                    try
                    {
                        var zipStream = zipFile.GetInputStream(zipEntry);
                        StreamUtils.Copy(zipStream, fs, buffer);
                    }
                    catch (Exception e)
                    {
                        Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.ZIP, fileEntry.FullPath, zipEntry.Name, e.GetType());
                    }

                    var name         = zipEntry.Name.Replace('/', Path.DirectorySeparatorChar);
                    var newFileEntry = new FileEntry(name, fs, fileEntry);

                    if (Extractor.IsQuine(newFileEntry))
                    {
                        Logger.Info(Extractor.IS_QUINE_STRING, fileEntry.Name, fileEntry.FullPath);
                        throw new OverflowException();
                    }

                    await foreach (var extractedFile in Context.ExtractAsync(newFileEntry, options, governor))
                    {
                        yield return(extractedFile);
                    }
                }
            }
        }
Ejemplo n.º 5
0
        /// <summary>
        ///     Extracts an archive file created with GNU ar
        /// </summary>
        /// <param name="fileEntry"> </param>
        /// <returns> </returns>
        ///
        public IEnumerable <FileEntry> Extract(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor)
        {
            IEnumerable <FileEntry>?fileEntries = null;

            try
            {
                fileEntries = ArFile.GetFileEntries(fileEntry, options, governor);
            }
            catch (Exception e)
            {
                Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.AR, fileEntry.FullPath, string.Empty, e.GetType());
                if (e is OverflowException)
                {
                    throw;
                }
            }
            if (fileEntries != null)
            {
                if (options.Parallel)
                {
                    while (fileEntries.Any())
                    {
                        var tempStore       = new ConcurrentStack <FileEntry>();
                        var selectedEntries = fileEntries.Take(options.BatchSize);
                        selectedEntries.AsParallel().ForAll(arEntry =>
                        {
                            tempStore.PushRange(Context.ExtractFile(arEntry, options, governor).ToArray());
                        });

                        fileEntries = fileEntries.Skip(selectedEntries.Count());

                        while (tempStore.TryPop(out var result))
                        {
                            if (result != null)
                            {
                                yield return(result);
                            }
                        }
                    }
                }
                else
                {
                    foreach (var entry in fileEntries)
                    {
                        foreach (var extractedFile in Context.ExtractFile(entry, options, governor))
                        {
                            yield return(extractedFile);
                        }
                    }
                }
            }
            else
            {
                if (options.ExtractSelfOnFail)
                {
                    yield return(fileEntry);
                }
            }
        }
Ejemplo n.º 6
0
 /// <summary>
 /// Automatically check compatibility with multiple (currently one, but there will be multiple) extractors;
 /// Return null if there is not a suitable extractor.
 /// Otherwise return a Extractor with option passed.
 /// </summary>
 public static Extractor findExtractor(ExtractorOptions option = null)
 {
     if (Tar.Check_compatibility(option))
     {
         return(new Tar(option));
     }
     return(null);
 }
Ejemplo n.º 7
0
        /// <summary>
        ///     Extracts an a Tar archive
        /// </summary>
        /// <param name="fileEntry"> </param>
        /// <returns> </returns>
        public async IAsyncEnumerable <FileEntry> ExtractAsync(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor)
        {
            TarEntry       tarEntry;
            TarInputStream?tarStream = null;

            try
            {
                tarStream = new TarInputStream(fileEntry.Content);
            }
            catch (Exception e)
            {
                Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.TAR, fileEntry.FullPath, string.Empty, e.GetType());
            }
            if (tarStream != null)
            {
                while ((tarEntry = tarStream.GetNextEntry()) != null)
                {
                    if (tarEntry.IsDirectory)
                    {
                        continue;
                    }

                    var fs = new FileStream(Path.GetTempFileName(), FileMode.Create, FileAccess.ReadWrite, FileShare.ReadWrite, 4096, FileOptions.DeleteOnClose);
                    governor.CheckResourceGovernor(tarStream.Length);
                    try
                    {
                        tarStream.CopyEntryContents(fs);
                    }
                    catch (Exception e)
                    {
                        Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.TAR, fileEntry.FullPath, tarEntry.Name, e.GetType());
                    }

                    var name = tarEntry.Name.Replace('/', Path.DirectorySeparatorChar);

                    var newFileEntry = new FileEntry(name, fs, fileEntry, true);

                    if (Extractor.IsQuine(newFileEntry))
                    {
                        Logger.Info(Extractor.IS_QUINE_STRING, fileEntry.Name, fileEntry.FullPath);
                        throw new OverflowException();
                    }

                    await foreach (var extractedFile in Context.ExtractAsync(newFileEntry, options, governor))
                    {
                        yield return(extractedFile);
                    }
                }
                tarStream.Dispose();
            }
            else
            {
                if (options.ExtractSelfOnFail)
                {
                    yield return(fileEntry);
                }
            }
        }
Ejemplo n.º 8
0
        private RarArchive?GetRarArchive(FileEntry fileEntry, ExtractorOptions options)
        {
            RarArchive?rarArchive = null;

            try
            {
                rarArchive = RarArchive.Open(fileEntry.Content);
            }
            catch (Exception e)
            {
                Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.RAR, fileEntry.FullPath, string.Empty, e.GetType());
            }
            var needsPassword = false;

            try
            {
                using var testStream = rarArchive.Entries.First().OpenEntryStream();
            }
            catch (Exception e)
            {
                needsPassword = true;
            }
            if (needsPassword is true)
            {
                var passwordFound = false;
                foreach (var passwords in options.Passwords.Where(x => x.Key.IsMatch(fileEntry.Name)))
                {
                    if (passwordFound)
                    {
                        break;
                    }
                    foreach (var password in passwords.Value)
                    {
                        try
                        {
                            fileEntry.Content.Position = 0;
                            rarArchive = RarArchive.Open(fileEntry.Content, new SharpCompress.Readers.ReaderOptions()
                            {
                                Password = password, LookForHeader = true
                            });
                            var count = 0; //To do something in the loop
                            foreach (var entry in rarArchive.Entries)
                            {
                                //Just do anything in the loop, but you need to loop over entries to check if the password is correct
                                count++;
                            }
                            break;
                        }
                        catch (Exception e)
                        {
                            Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.RAR, fileEntry.FullPath, string.Empty, e.GetType());
                        }
                    }
                }
            }
            return(rarArchive);
        }
Ejemplo n.º 9
0
 /// <summary>
 ///     Extracts an archive file created with GNU ar
 /// </summary>
 /// <param name="fileEntry"> </param>
 /// <returns> </returns>
 ///
 public async IAsyncEnumerable <FileEntry> ExtractAsync(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor)
 {
     await foreach (var entry in ArFile.GetFileEntriesAsync(fileEntry, options, governor))
     {
         await foreach (var extractedFile in Context.ExtractAsync(entry, options, governor))
         {
             yield return(extractedFile);
         }
     }
 }
Ejemplo n.º 10
0
        public Extractor(ExtractorOptions options, Func <IDocument, ExtractResult> transform = null)
        {
            _options  = options;
            transform = transform ?? Extract;

            _inner = new TransformBlock <IDocument, ExtractResult>(transform, new ExecutionDataflowBlockOptions
            {
                MaxDegreeOfParallelism = -1
            });
        }
Ejemplo n.º 11
0
 public Extractor(ExtractorOptions options = null)
 {
     if (options == null)
     {
         this.options = new ExtractorOptions();
     }
     else
     {
         this.options = options;
     }
 }
Ejemplo n.º 12
0
        private SevenZipArchive?GetSevenZipArchive(FileEntry fileEntry, ExtractorOptions options)
        {
            SevenZipArchive?sevenZipArchive = null;

            try
            {
                sevenZipArchive = SevenZipArchive.Open(fileEntry.Content);
            }
            catch (Exception e)
            {
                Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.P7ZIP, fileEntry.FullPath, string.Empty, e.GetType());
            }
            var needsPassword = false;

            try
            {
                needsPassword = sevenZipArchive?.TotalUncompressSize == 0;
            }
            catch (Exception)
            {
                needsPassword = true;
            }
            if (needsPassword is true)
            {
                var passwordFound = false;
                foreach (var passwords in options.Passwords.Where(x => x.Key.IsMatch(fileEntry.Name)))
                {
                    if (passwordFound)
                    {
                        break;
                    }
                    foreach (var password in passwords.Value)
                    {
                        try
                        {
                            sevenZipArchive = SevenZipArchive.Open(fileEntry.Content, new SharpCompress.Readers.ReaderOptions()
                            {
                                Password = password
                            });
                            if (sevenZipArchive.TotalUncompressSize > 0)
                            {
                                passwordFound = true;
                                break;
                            }
                        }
                        catch (Exception e)
                        {
                            Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.P7ZIP, fileEntry.FullPath, string.Empty, e.GetType());
                        }
                    }
                }
            }
            return(sevenZipArchive);
        }
Ejemplo n.º 13
0
        /// <summary>
        ///     Extracts an an ISO file
        /// </summary>
        /// <param name="fileEntry"> </param>
        /// <returns> </returns>
        public IEnumerable <FileEntry> Extract(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor)
        {
            using var cd = new CDReader(fileEntry.Content, true);
            var entries = cd.Root.GetFiles("*.*", SearchOption.AllDirectories);

            if (entries != null)
            {
                if (options.Parallel)
                {
                    var files = new ConcurrentStack <FileEntry>();

                    var batchSize           = Math.Min(options.BatchSize, entries.Length);
                    var selectedFileEntries = entries[0..batchSize];
Ejemplo n.º 14
0
        /// <summary>
        ///     Extracts a WIM file contained in fileEntry.
        /// </summary>
        /// <param name="fileEntry"> FileEntry to extract </param>
        /// <returns> Extracted files </returns>
        public IEnumerable <FileEntry> Extract(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor)
        {
            DiscUtils.Wim.WimFile?baseFile = null;
            try
            {
                baseFile = new DiscUtils.Wim.WimFile(fileEntry.Content);
            }
            catch (Exception e)
            {
                Logger.Debug(e, "Failed to init WIM image.");
            }
            if (baseFile != null)
            {
                for (var i = 0; i < baseFile.ImageCount; i++)
                {
                    var image = baseFile.GetImage(i);
                    foreach (var file in image.GetFiles(image.Root.FullName, "*.*", SearchOption.AllDirectories))
                    {
                        Stream?stream = null;
                        try
                        {
                            var info = image.GetFileInfo(file);
                            stream = info.OpenRead();
                            governor.CheckResourceGovernor(info.Length);
                        }
                        catch (Exception e)
                        {
                            Logger.Debug("Error reading {0} from WIM {1} ({2}:{3})", file, image.FriendlyName, e.GetType(), e.Message);
                        }
                        if (stream != null)
                        {
                            var name = file.Replace('\\', Path.DirectorySeparatorChar);

                            var newFileEntry = new FileEntry($"{image.FriendlyName}{Path.DirectorySeparatorChar}{name}", stream, fileEntry);
                            foreach (var entry in Context.Extract(newFileEntry, options, governor))
                            {
                                yield return(entry);
                            }
                            stream.Dispose();
                        }
                    }
                }
            }
            else
            {
                if (options.ExtractSelfOnFail)
                {
                    yield return(fileEntry);
                }
            }
        }
Ejemplo n.º 15
0
        /// <summary>
        ///     Extracts an zip file contained in fileEntry.
        /// </summary>
        /// <param name="fileEntry"> FileEntry to extract </param>
        /// <returns> Extracted files </returns>
        public IEnumerable <FileEntry> Extract(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor)
        {
            XZStream?xzStream = null;

            try
            {
                xzStream = new XZStream(fileEntry.Content);
            }
            catch (Exception e)
            {
                Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.XZ, fileEntry.FullPath, string.Empty, e.GetType());
            }
            if (xzStream != null)
            {
                var newFilename  = Path.GetFileNameWithoutExtension(fileEntry.Name);
                var newFileEntry = new FileEntry(newFilename, xzStream, fileEntry);

                // SharpCompress does not expose metadata without a full read, so we need to decompress first,
                // and then abort if the bytes exceeded the governor's capacity.

                var streamLength = xzStream.Index?.Records?.Select(r => r.UncompressedSize)
                                   .Aggregate((ulong?)0, (a, b) => a + b);

                // BUG: Technically, we're casting a ulong to a long, but we don't expect 9 exabyte steams, so
                // low risk.
                if (streamLength.HasValue)
                {
                    governor.CheckResourceGovernor((long)streamLength.Value);
                }

                if (Extractor.IsQuine(newFileEntry))
                {
                    Logger.Info(Extractor.IS_QUINE_STRING, fileEntry.Name, fileEntry.FullPath);
                    throw new OverflowException();
                }

                foreach (var extractedFile in Context.Extract(newFileEntry, options, governor))
                {
                    yield return(extractedFile);
                }
                xzStream.Dispose();
            }
            else
            {
                if (options.ExtractSelfOnFail)
                {
                    yield return(fileEntry);
                }
            }
        }
Ejemplo n.º 16
0
        public static int ExtractCommand(ExtractCommandOptions options)
        {
            var config        = new LoggingConfiguration();
            var consoleTarget = new ConsoleTarget
            {
                Name   = "console",
                Layout = "${longdate}|${level:uppercase=true}|${logger}|${message}",
            };

            if (options.Verbose)
            {
                config.AddRule(LogLevel.Trace, LogLevel.Fatal, consoleTarget, "*");
            }
            else if (options.Debug)
            {
                config.AddRule(LogLevel.Debug, LogLevel.Fatal, consoleTarget, "*");
            }
            else
            {
                config.AddRule(LogLevel.Info, LogLevel.Fatal, consoleTarget, "*");
            }

            LogManager.Configuration = config;

            var extractor        = new Extractor();
            var extractorOptions = new ExtractorOptions()
            {
                ExtractSelfOnFail = true,
                Parallel          = true,
                RawExtensions     = options.RawExtensions
            };

            if (options.Passwords?.Any() ?? false)
            {
                extractorOptions.Passwords = new Dictionary <Regex, List <string> >()
                {
                    {
                        new Regex(".*", RegexOptions.Compiled),
                        options.Passwords.ToList()
                    }
                };
            }
            var allowRegexes = options.AllowFilters?.Select(x => new Regex(x)) ?? Array.Empty <Regex>();
            var denyRegexes  = options.DenyFilters?.Select(x => new Regex(x)) ?? Array.Empty <Regex>();

            extractor.ExtractToDirectory(options.Output, options.Input, extractorOptions, allowRegexes, denyRegexes, options.PrintNames);

            return(0);
        }
        private void ParseFile(string path)
        {
            Log.Verbose("Started parsing {0}", path);
            FileSystemObject obj = FilePathToFileSystemObject(path);

            if (obj != null)
            {
                HandleChange(obj);

                // If we know how to handle this as an archive, and crawling archives is enabled
                if (opts.CrawlArchives && MiniMagic.DetectFileType(path) != ArchiveFileType.UNKNOWN)
                {
                    var opts = new ExtractorOptions()
                    {
                        ExtractSelfOnFail = false
                    };
                    Extractor extractor = new Extractor();
                    foreach (var fso in extractor.ExtractFile(path, opts).Select(fileEntry => FileEntryToFileSystemObject(fileEntry)))
                    {
                        HandleChange(fso);
                    }
                }

                // TODO: Also try parse .DER as a key
                if (path.EndsWith(".cer", StringComparison.CurrentCulture) ||
                    path.EndsWith(".der", StringComparison.CurrentCulture) ||
                    path.EndsWith(".p7b", StringComparison.CurrentCulture) ||
                    path.EndsWith(".pfx", StringComparison.CurrentCulture))
                {
                    try
                    {
                        using var certificate = new X509Certificate2(path);

                        var certObj = new CertificateObject(
                            StoreLocation: StoreLocation.LocalMachine.ToString(),
                            StoreName: StoreName.Root.ToString(),
                            Certificate: new SerializableCertificate(certificate));

                        HandleChange(certObj);
                    }
                    catch (Exception e)
                    {
                        Log.Verbose("Could not parse certificate from file: {0} ({1}:{2})", path, e.GetType(), e.Message);
                    }
                }
            }
            Log.Verbose("Finished parsing {0}", path);
        }
Ejemplo n.º 18
0
 public Tar(ExtractorOptions options) : base(options)
 {
     if (new FileInfo(options.archive_filepath).Length < min_tar_size)
     {
         throw new NotSupportedException("The provided tar file is invalid");
     }
     filepath = options.archive_filepath;
     if (options.statemap_filepath == "")
     {
         options.statemap_filepath = filepath + ".statemap";
     }
     statemapPath = options.statemap_filepath;
     states       = new Dictionary <string, TarState>();
     folderList   = new Dictionary <string, VFolder>();
     folderList.Add("/", VFolder.RootFolder);
     fileList = new Dictionary <string, VFile>();
 }
Ejemplo n.º 19
0
        ///     Extracts an BZip2 file contained in fileEntry.
        /// </summary>
        /// <param name="fileEntry"> FileEntry to extract </param>
        /// <returns> Extracted files </returns>
        public IEnumerable <FileEntry> Extract(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor)
        {
            BZip2Stream?bzip2Stream = null;

            try
            {
                bzip2Stream = new BZip2Stream(fileEntry.Content, SharpCompress.Compressors.CompressionMode.Decompress, false);
                governor.CheckResourceGovernor(bzip2Stream.Length);
            }
            catch (Exception e)
            {
                Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.BZIP2, fileEntry.FullPath, string.Empty, e.GetType());
            }
            if (bzip2Stream != null)
            {
                var newFilename = Path.GetFileNameWithoutExtension(fileEntry.Name);
                var entryStream = bzip2Stream.Length > options.MemoryStreamCutoff ?
                                  new FileStream(Path.GetTempFileName(), FileMode.Create, FileAccess.ReadWrite, FileShare.ReadWrite, 4096, FileOptions.DeleteOnClose) :
                                  (Stream) new MemoryStream((int)bzip2Stream.Length);
                var newFileEntry = new FileEntry(newFilename, bzip2Stream, fileEntry);

                if (Extractor.IsQuine(newFileEntry))
                {
                    Logger.Info(Extractor.IS_QUINE_STRING, fileEntry.Name, fileEntry.FullPath);
                    bzip2Stream.Dispose();
                    throw new OverflowException();
                }

                foreach (var extractedFile in Context.Extract(newFileEntry, options, governor))
                {
                    yield return(extractedFile);
                }
                bzip2Stream.Dispose();
            }
            else
            {
                if (options.ExtractSelfOnFail)
                {
                    yield return(fileEntry);
                }
            }
        }
Ejemplo n.º 20
0
        /// <summary>
        ///     Extracts an an ISO file
        /// </summary>
        /// <param name="fileEntry"> </param>
        /// <returns> </returns>
        public async IAsyncEnumerable <FileEntry> ExtractAsync(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor)
        {
            using var cd = new CDReader(fileEntry.Content, true);
            var entries = cd.Root.GetFiles("*.*", SearchOption.AllDirectories);

            if (entries != null)
            {
                foreach (var file in entries)
                {
                    var fileInfo = file;
                    governor.CheckResourceGovernor(fileInfo.Length);
                    Stream?stream = null;
                    try
                    {
                        stream = fileInfo.OpenRead();
                    }
                    catch (Exception e)
                    {
                        Logger.Debug("Failed to extract {0} from ISO {1}. ({2}:{3})", fileInfo.Name, fileEntry.FullPath, e.GetType(), e.Message);
                    }
                    if (stream != null)
                    {
                        var name         = fileInfo.Name.Replace('/', Path.DirectorySeparatorChar);
                        var newFileEntry = await FileEntry.FromStreamAsync(name, stream, fileEntry);

                        var innerEntries = Context.ExtractAsync(newFileEntry, options, governor);
                        await foreach (var entry in innerEntries)
                        {
                            yield return(entry);
                        }
                    }
                }
            }
            else
            {
                if (options.ExtractSelfOnFail)
                {
                    yield return(fileEntry);
                }
            }
        }
Ejemplo n.º 21
0
        /// <summary>
        ///     Extracts an BZip2 file contained in fileEntry.
        /// </summary>
        /// <param name="fileEntry"> FileEntry to extract </param>
        /// <returns> Extracted files </returns>
        public async IAsyncEnumerable <FileEntry> ExtractAsync(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor)
        {
            BZip2Stream?bzip2Stream = null;

            try
            {
                bzip2Stream = new BZip2Stream(fileEntry.Content, SharpCompress.Compressors.CompressionMode.Decompress, false);
                governor.CheckResourceGovernor(bzip2Stream.Length);
            }
            catch (Exception e)
            {
                Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.BZIP2, fileEntry.FullPath, string.Empty, e.GetType());
            }
            if (bzip2Stream != null)
            {
                var newFilename  = Path.GetFileNameWithoutExtension(fileEntry.Name);
                var newFileEntry = await FileEntry.FromStreamAsync(newFilename, bzip2Stream, fileEntry);

                if (Extractor.IsQuine(newFileEntry))
                {
                    Logger.Info(Extractor.IS_QUINE_STRING, fileEntry.Name, fileEntry.FullPath);
                    bzip2Stream.Dispose();
                    throw new OverflowException();
                }

                await foreach (var extractedFile in Context.ExtractAsync(newFileEntry, options, governor))
                {
                    yield return(extractedFile);
                }
                bzip2Stream.Dispose();
            }
            else
            {
                if (options.ExtractSelfOnFail)
                {
                    yield return(fileEntry);
                }
            }
        }
Ejemplo n.º 22
0
        /// <summary>
        /// Reads the sub resource and saves it to the given <paramref name="outputFolder"/>
        /// </summary>
        /// <param name="resources">The sub resource</param>
        /// <param name="outputFolder">The output folder where to save the resource</param>
        /// <param name="mainUri">The main uri of the web page</param>
        /// <param name="options"><see cref="ExtractorOptions"/></param>
        /// <param name="webPage">The main web page</param>
        private void ProcessSubResources(
            IDictionary resources,
            string outputFolder,
            Uri mainUri,
            ExtractorOptions options,
            ref string webPage)
        {
            Uri uri = null;

            byte[] data     = null;
            string mimeType = null;

            foreach (DictionaryEntry resource in resources)
            {
                switch (resource.Key)
                {
                case WebResourceUrl:
                    uri = new Uri((string)resource.Value);
                    break;

                case WebResourceData:
                    data = (byte[])resource.Value;
                    break;

                case WebResourceMimeType:
                    mimeType = (string)resource.Value;
                    break;
                }
            }

            if ((mimeType == "text/javascript" || mimeType == "application/javascript" ||
                 mimeType == "application/x-javascript") && options == ExtractorOptions.IgnoreJavaScriptFiles)
            {
                Logger.WriteToLog("Ignoring javascript file, replacing it with a empty string in the web page");
                ReplaceWebPageUrl(uri, mainUri, string.Empty, ref webPage);
                return;
            }

            if (data != null && uri != null && uri.LocalPath.StartsWith("/"))
            {
                var fileRelativeUri = uri.LocalPath.Replace(mainUri.AbsolutePath, string.Empty).TrimStart('/');
                var path            = Path.Combine(outputFolder, fileRelativeUri);
                var fileInfo        = new FileInfo(path);

                if (fileInfo.Exists || File.Exists(fileInfo.DirectoryName) || Directory.Exists(fileInfo.FullName))
                {
                    path = Path.Combine(outputFolder, Guid.NewGuid().ToString());
                }

                fileInfo = new FileInfo(path);

                if (!fileInfo.FullName.EndsWith(@"\"))
                {
                    fileInfo.Directory?.Create();
                    File.WriteAllBytes(fileInfo.FullName, data);
                    ReplaceWebPageUrl(uri, mainUri, fileRelativeUri, ref webPage);
                }
                else
                {
                    Logger.WriteToLog($"Ignoring url '{uri}'");
                }
            }
        }
Ejemplo n.º 23
0
        protected Crawler(IDocumentFactory documentFactory, IKeyValueStore <string, Result> store, IKeyValueStore <string, FetchTarget> frontier)
        {
            _store    = store;
            _frontier = frontier;

            var fetcherOptions = new FetcherOptions
            {
                UserAgent = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
            };

            var parserOptions = new ParserOptions
            {
            };

            var scraperOptions = new ScraperOptions
            {
            };

            var extractorOptions = new ExtractorOptions
            {
            };

            //var storerOptions = new StorerOptions
            //{
            //};

            var builderOptions = new BuilderOptions
            {
            };

            var providerOptions = new ProviderOptions
            {
            };

            //var dispatcherOptions = new DispatcherOptions
            //{
            //};


            Fetcher    = new Fetcher(fetcherOptions);
            Parser     = new Parser(parserOptions, documentFactory);
            Scraper    = new Scraper(scraperOptions);
            Extractor  = new Extractor(extractorOptions);
            Storer     = new Storer(store);
            Builder    = new Builder(builderOptions);
            Provider   = new Provider(providerOptions, store, frontier);
            Dispatcher = new Dispatcher();

            Fetcher.SendTo(Parser, x => x.StatusCode == System.Net.HttpStatusCode.OK);

            Parser.SendTo(Scraper);
            Parser.SendTo(Extractor);

            Fetcher.SendTo(Builder, x => x.StatusCode == System.Net.HttpStatusCode.OK);
            Scraper.SendTo(Builder);
            Extractor.SendTo(Builder);

            Builder.SendTo(Storer);

            //Storer.LinkTo(new ActionBlock<Result>(x =>
            //{
            //}));

            Builder.SendTo(Provider);
            Provider.SendTo(Dispatcher, x => x != null);
            Dispatcher.SendTo(Fetcher);
        }
Ejemplo n.º 24
0
        /// <summary>
        ///     Extracts an a RAR archive
        /// </summary>
        /// <param name="fileEntry"> </param>
        /// <returns> </returns>
        public IEnumerable <FileEntry> Extract(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor)
        {
            var rarArchive = GetRarArchive(fileEntry, options);

            if (rarArchive != null)
            {
                var entries = rarArchive.Entries.Where(x => x.IsComplete && !x.IsDirectory);
                if (options.Parallel)
                {
                    var files = new ConcurrentStack <FileEntry>();

                    while (entries.Any())
                    {
                        var batchSize = Math.Min(options.BatchSize, entries.Count());

                        var streams = entries.Take(batchSize).Select(entry => (entry, entry.OpenEntryStream())).ToList();

                        governor.CheckResourceGovernor(streams.Sum(x => x.Item2.Length));

                        streams.AsParallel().ForAll(streampair =>
                        {
                            try
                            {
                                var newFileEntry = new FileEntry(streampair.entry.Key, streampair.Item2, fileEntry);
                                if (Extractor.IsQuine(newFileEntry))
                                {
                                    Logger.Info(Extractor.IS_QUINE_STRING, fileEntry.Name, fileEntry.FullPath);
                                    governor.CurrentOperationProcessedBytesLeft = -1;
                                }
                                else
                                {
                                    files.PushRange(Context.Extract(newFileEntry, options, governor).ToArray());
                                }
                            }
                            catch (Exception e)
                            {
                                Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.RAR, fileEntry.FullPath, streampair.entry.Key, e.GetType());
                            }
                        });
                        governor.CheckResourceGovernor(0);

                        entries = entries.Skip(streams.Count);

                        while (files.TryPop(out var result))
                        {
                            if (result != null)
                            {
                                yield return(result);
                            }
                        }
                    }
                }
                else
                {
                    foreach (var entry in entries)
                    {
                        governor.CheckResourceGovernor(entry.Size);
                        FileEntry?newFileEntry = null;
                        try
                        {
                            var name = entry.Key.Replace('/', Path.DirectorySeparatorChar);

                            newFileEntry = new FileEntry(name, entry.OpenEntryStream(), fileEntry);
                        }
                        catch (Exception e)
                        {
                            Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.RAR, fileEntry.FullPath, entry.Key, e.GetType());
                        }
                        if (newFileEntry != null)
                        {
                            if (Extractor.IsQuine(newFileEntry))
                            {
                                Logger.Info(Extractor.IS_QUINE_STRING, fileEntry.Name, fileEntry.FullPath);
                                throw new OverflowException();
                            }
                            foreach (var extractedFile in Context.Extract(newFileEntry, options, governor))
                            {
                                yield return(extractedFile);
                            }
                        }
                    }
                }
            }
            else
            {
                if (options.ExtractSelfOnFail)
                {
                    yield return(fileEntry);
                }
            }
        }
Ejemplo n.º 25
0
        /// <summary>
        /// Dump the FileEntries from a Logical Volume
        /// </summary>
        /// <param name="volume">The Volume to dump</param>
        /// <param name="parentPath">The Path to the parent Disc</param>
        /// <param name="options">Extractor Options to use</param>
        /// <param name="governor">Resource Governor to use</param>
        /// <param name="Context">Extractor context to use</param>
        /// <param name="parent">The Parent FilEntry</param>
        /// <returns></returns>
        public static IEnumerable <FileEntry> DumpLogicalVolume(LogicalVolumeInfo volume, string parentPath, ExtractorOptions options, ResourceGovernor governor, Extractor Context, FileEntry?parent = null)
        {
            DiscUtils.FileSystemInfo[]? fsInfos = null;
            try
            {
                fsInfos = FileSystemManager.DetectFileSystems(volume);
            }
            catch (Exception e)
            {
                Logger.Debug("Failed to get file systems from logical volume {0} Image {1} ({2}:{3})", volume.Identity, parentPath, e.GetType(), e.Message);
            }

            foreach (var fsInfo in fsInfos ?? Array.Empty <DiscUtils.FileSystemInfo>())
            {
                using var fs = fsInfo.Open(volume);
                var diskFiles = fs.GetFiles(fs.Root.FullName, "*.*", SearchOption.AllDirectories).ToList();
                if (options.Parallel)
                {
                    var files = new ConcurrentStack <FileEntry>();

                    while (diskFiles.Any())
                    {
                        var  batchSize   = Math.Min(options.BatchSize, diskFiles.Count);
                        var  range       = diskFiles.GetRange(0, batchSize);
                        var  fileinfos   = new List <(DiscFileInfo, Stream)>();
                        long totalLength = 0;
                        foreach (var r in range)
                        {
                            try
                            {
                                var fi = fs.GetFileInfo(r);
                                totalLength += fi.Length;
                                fileinfos.Add((fi, fi.OpenRead()));
                            }
                            catch (Exception e)
                            {
                                Logger.Debug("Failed to get FileInfo from {0} in Volume {1} @ {2} ({3}:{4})", r, volume.Identity, parentPath, e.GetType(), e.Message);
                            }
                        }

                        governor.CheckResourceGovernor(totalLength);

                        fileinfos.AsParallel().ForAll(file =>
                        {
                            if (file.Item2 != null)
                            {
                                var newFileEntry = new FileEntry($"{volume.Identity}{Path.DirectorySeparatorChar}{file.Item1.FullName}", file.Item2, parent);
                                var entries      = Context.ExtractFile(newFileEntry, options, governor);
                                files.PushRange(entries.ToArray());
                            }
                        });
                        diskFiles.RemoveRange(0, batchSize);

                        while (files.TryPop(out var result))
                        {
                            if (result != null)
                            {
                                yield return(result);
                            }
                        }
                    }
                }
                else
                {
                    foreach (var file in diskFiles)
                    {
                        Stream?fileStream = null;
                        try
                        {
                            var fi = fs.GetFileInfo(file);
                            governor.CheckResourceGovernor(fi.Length);
                            fileStream = fi.OpenRead();
                        }
                        catch (Exception e)
                        {
                            Logger.Debug(e, "Failed to open {0} in volume {1}", file, volume.Identity);
                        }
                        if (fileStream != null)
                        {
                            var newFileEntry = new FileEntry($"{volume.Identity}{Path.DirectorySeparatorChar}{file}", fileStream, parent);
                            var entries      = Context.ExtractFile(newFileEntry, options, governor);
                            foreach (var entry in entries)
                            {
                                yield return(entry);
                            }
                        }
                    }
                }
            }
        }
Ejemplo n.º 26
0
        /// <summary>
        /// Dump the FileEntries from a Logical Volume asynchronously
        /// </summary>
        /// <param name="volume">The Volume to dump</param>
        /// <param name="parentPath">The Path to the parent Disc</param>
        /// <param name="options">Extractor Options to use</param>
        /// <param name="governor">Resource Governor to use</param>
        /// <param name="Context">Extractor context to use</param>
        /// <param name="parent">The Parent FilEntry</param>
        /// <returns></returns>
        public static async IAsyncEnumerable <FileEntry> DumpLogicalVolumeAsync(LogicalVolumeInfo volume, string parentPath, ExtractorOptions options, ResourceGovernor governor, Extractor Context, FileEntry?parent = null)
        {
            DiscUtils.FileSystemInfo[]? fsInfos = null;
            try
            {
                fsInfos = FileSystemManager.DetectFileSystems(volume);
            }
            catch (Exception e)
            {
                Logger.Debug("Failed to get file systems from logical volume {0} Image {1} ({2}:{3})", volume.Identity, parentPath, e.GetType(), e.Message);
            }

            foreach (var fsInfo in fsInfos ?? Array.Empty <DiscUtils.FileSystemInfo>())
            {
                using var fs = fsInfo.Open(volume);
                var diskFiles = fs.GetFiles(fs.Root.FullName, "*.*", SearchOption.AllDirectories).ToList();

                foreach (var file in diskFiles)
                {
                    Stream?      fileStream = null;
                    DiscFileInfo?fi         = null;
                    try
                    {
                        fi = fs.GetFileInfo(file);
                        governor.CheckResourceGovernor(fi.Length);
                        fileStream = fi.OpenRead();
                    }
                    catch (Exception e)
                    {
                        Logger.Debug(e, "Failed to open {0} in volume {1}", file, volume.Identity);
                    }
                    if (fileStream != null && fi != null)
                    {
                        var newFileEntry = await FileEntry.FromStreamAsync($"{volume.Identity}{Path.DirectorySeparatorChar}{fi.FullName}", fileStream, parent);

                        var entries = Context.ExtractAsync(newFileEntry, options, governor);
                        await foreach (var entry in entries)
                        {
                            yield return(entry);
                        }
                    }
                }
            }
        }
Ejemplo n.º 27
0
        protected Crawler(IDocumentFactory documentFactory, IKeyValueStore<string, Result> store, IKeyValueStore<string, FetchTarget> frontier)
        {
            _store = store;
            _frontier = frontier;

            var fetcherOptions = new FetcherOptions
            {
                UserAgent = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
            };

            var parserOptions = new ParserOptions
            {
            };

            var scraperOptions = new ScraperOptions
            {
            };

            var extractorOptions = new ExtractorOptions
            {
            };

            //var storerOptions = new StorerOptions
            //{
            //};

            var builderOptions = new BuilderOptions
            {
            };

            var providerOptions = new ProviderOptions
            {
            };

            //var dispatcherOptions = new DispatcherOptions
            //{
            //};

            Fetcher = new Fetcher(fetcherOptions);
            Parser = new Parser(parserOptions, documentFactory);
            Scraper = new Scraper(scraperOptions);
            Extractor = new Extractor(extractorOptions);
            Storer = new Storer(store);
            Builder = new Builder(builderOptions);
            Provider = new Provider(providerOptions, store, frontier);
            Dispatcher = new Dispatcher();

            Fetcher.SendTo(Parser, x => x.StatusCode == System.Net.HttpStatusCode.OK);

            Parser.SendTo(Scraper);
            Parser.SendTo(Extractor);

            Fetcher.SendTo(Builder, x => x.StatusCode == System.Net.HttpStatusCode.OK);
            Scraper.SendTo(Builder);
            Extractor.SendTo(Builder);

            Builder.SendTo(Storer);

            //Storer.LinkTo(new ActionBlock<Result>(x =>
            //{
            //}));

            Builder.SendTo(Provider);
            Provider.SendTo(Dispatcher, x => x != null);
            Dispatcher.SendTo(Fetcher);
        }
Ejemplo n.º 28
0
        /// <summary>
        ///     Extracts a 7-Zip file contained in fileEntry.
        /// </summary>
        /// <param name="fileEntry"> FileEntry to extract </param>
        /// <returns> Extracted files </returns>

        public IEnumerable <FileEntry> Extract(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor)
        {
            var sevenZipArchive = GetSevenZipArchive(fileEntry, options);

            if (sevenZipArchive != null)
            {
                var entries = sevenZipArchive.Entries.Where(x => !x.IsDirectory && x.IsComplete).ToList();
                if (options.Parallel)
                {
                    var files = new ConcurrentStack <FileEntry>();

                    while (entries.Count() > 0)
                    {
                        var batchSize       = Math.Min(options.BatchSize, entries.Count());
                        var selectedEntries = entries.GetRange(0, batchSize).Select(entry => (entry, entry.OpenEntryStream()));
                        governor.CheckResourceGovernor(selectedEntries.Sum(x => x.entry.Size));

                        try
                        {
                            selectedEntries.AsParallel().ForAll(entry =>
                            {
                                try
                                {
                                    var name = entry.entry.Key.Replace('/', Path.DirectorySeparatorChar);

                                    var newFileEntry = new FileEntry(name, entry.Item2, fileEntry);
                                    if (Extractor.IsQuine(newFileEntry))
                                    {
                                        Logger.Info(Extractor.IS_QUINE_STRING, fileEntry.Name, fileEntry.FullPath);
                                        governor.CurrentOperationProcessedBytesLeft = -1;
                                    }
                                    else
                                    {
                                        files.PushRange(Context.Extract(newFileEntry, options, governor).ToArray());
                                    }
                                }
                                catch (Exception e) when(e is OverflowException)
                                {
                                    Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.P7ZIP, fileEntry.FullPath, entry.entry.Key, e.GetType());
                                    throw;
                                }
                                catch (Exception e)
                                {
                                    Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.P7ZIP, fileEntry.FullPath, entry.entry.Key, e.GetType());
                                }
                            });
                        }
                        catch (Exception e) when(e is AggregateException)
                        {
                            if (e.InnerException?.GetType() == typeof(OverflowException))
                            {
                                throw e.InnerException;
                            }
                            throw;
                        }

                        governor.CheckResourceGovernor(0);
                        entries.RemoveRange(0, batchSize);

                        while (files.TryPop(out var result))
                        {
                            if (result != null)
                            {
                                yield return(result);
                            }
                        }
                    }
                }
                else
                {
                    foreach (var entry in entries)
                    {
                        governor.CheckResourceGovernor(entry.Size);
                        var name = entry.Key.Replace('/', Path.DirectorySeparatorChar);

                        var newFileEntry = new FileEntry(name, entry.OpenEntryStream(), fileEntry);

                        if (Extractor.IsQuine(newFileEntry))
                        {
                            Logger.Info(Extractor.IS_QUINE_STRING, fileEntry.Name, fileEntry.FullPath);
                            throw new OverflowException();
                        }
                        foreach (var extractedFile in Context.Extract(newFileEntry, options, governor))
                        {
                            yield return(extractedFile);
                        }
                    }
                }
            }
            else
            {
                if (options.ExtractSelfOnFail)
                {
                    yield return(fileEntry);
                }
            }
        }
Ejemplo n.º 29
0
        /// <summary>
        /// Extract the given <paramref name="inputFile"/> to the given <paramref name="outputFolder"/>
        /// </summary>
        /// <param name="inputFile">The input file</param>
        /// <param name="outputFolder">The folder where to save the extracted web archive</param>
        /// <param name="options"><see cref="ExtractorOptions"/></param>
        /// <param name="logStream">When set then logging is written to this stream</param>
        /// <returns></returns>
        /// <exception cref="WAEInvalidFile">Raised when a required resource is not found in the web archive</exception>
        /// <exception cref="WAEResourceMissing">Raised when a required resource is not found in the web archive</exception>
        /// <exception cref="FileNotFoundException">Raised when the <paramref name="inputFile"/> is not found</exception>
        /// <exception cref="DirectoryNotFoundException">Raised when the <paramref name="outputFolder"/> does not exist</exception>
        public string Extract(string inputFile, string outputFolder, ExtractorOptions options = ExtractorOptions.None, Stream logStream = null)
        {
            if (logStream != null)
            {
                Logger.LogStream = logStream;
            }

            try
            {
                if (!Directory.Exists(outputFolder))
                {
                    throw new DirectoryNotFoundException($"The output folder '{outputFolder}' does not exist");
                }

                var reader = new PList.BinaryPlistReader();

                IDictionary archive;

                try
                {
                    archive = reader.ReadObject(inputFile);
                }
                catch (Exception exception)
                {
                    throw new WAEInvalidFile($"The file '{inputFile}' is not a valid Safari web archive", exception);
                }

                if (!archive.Contains(WebMainResource))
                {
                    var message = $"Can't find the resource '{WebMainResource}' in the webarchive";
                    Logger.WriteToLog(message);
                    throw new WAEResourceMissing(message);
                }

                var mainResource    = (IDictionary)archive[WebMainResource];
                var webPageFileName = Path.Combine(outputFolder, "webpage.html");

                Logger.WriteToLog($"Getting main web page from '{WebMainResource}'");
                var webPage = ProcessMainResource(mainResource, out var mainUri);

#if (DEBUG)
                File.WriteAllText(webPageFileName, webPage);
#endif

                if (!archive.Contains(WebSubresources))
                {
                    Logger.WriteToLog("Web archive does not contain any sub resources");
                }
                else
                {
                    var subResources = (object[])archive[WebSubresources];
                    var count        = subResources.Length;
                    Logger.WriteToLog($"Web archive has {count} sub resource{(count > 1 ? "s" : string.Empty)}");

                    foreach (IDictionary subResource in subResources)
                    {
                        ProcessSubResources(subResource, outputFolder, mainUri, options, ref webPage);
                    }
                }

                if (!archive.Contains(WebSubframeArchives))
                {
                    Logger.WriteToLog("Web archive does not contain any sub frame archives");
                }
                else
                {
                    var subFrameResources      = (object[])archive[WebSubframeArchives];
                    var subFrameResourcesCount = subFrameResources.Length;

                    Logger.WriteToLog($"Web archive has {subFrameResourcesCount} sub frame resource{(subFrameResourcesCount > 1 ? "s" : string.Empty)}");

                    var i = 1;

                    foreach (IDictionary subFrameResource in subFrameResources)
                    {
                        var subFrameMainResource = (IDictionary)subFrameResource[WebMainResource];

                        Logger.WriteToLog($"Getting web page from sub frame resource '{WebMainResource}'");
                        var subFrameResourceWebPage = ProcessSubFrameMainResource(subFrameMainResource, out var frameName, out var subFrameMainUri);
                        var subFrameOutputFolder    = Path.Combine(outputFolder, $"subframe_{i}");

                        Logger.WriteToLog($"Creating folder '{subFrameOutputFolder}' for iframe '{frameName}' content");
                        Directory.CreateDirectory(subFrameOutputFolder);
                        i += 1;

                        var subFrameSubResources = (object[])subFrameResource[WebSubresources];

                        if (subFrameSubResources == null)
                        {
                            Logger.WriteToLog("Web archive sub frame does not contain any sub resources");
                        }
                        else
                        {
                            var subFrameSubResourcesCount = subFrameSubResources.Length;
                            Logger.WriteToLog($"Web archive sub frame has {subFrameSubResourcesCount} sub resource{(subFrameSubResourcesCount > 1 ? "s" : string.Empty)}");

                            foreach (IDictionary subFrameSubResource in subFrameSubResources)
                            {
                                ProcessSubResources(subFrameSubResource, subFrameOutputFolder, subFrameMainUri, options,
                                                    ref subFrameResourceWebPage);
                            }
                        }

                        var subFrameWebPageFileName    = Path.Combine(subFrameOutputFolder, "webpage.html");
                        var subFrameWebPageRelativeUri = $"subframe_{i}/webpage.html";
                        var subFrameUri = subFrameMainUri.ToString();
                        var subFrameUriWithoutScheme  = subFrameUri.Replace($"{subFrameMainUri.Scheme}:", string.Empty);
                        var subFrameUriWithoutMainUri = subFrameUri.Replace($"{mainUri.Scheme}://{mainUri.Host}{mainUri.AbsolutePath}", string.Empty);

                        if (webPage.Contains(subFrameUri))
                        {
                            Logger.WriteToLog($"Replacing '{subFrameUri}' with '{subFrameWebPageRelativeUri}'");
                            webPage = webPage.Replace(subFrameUri, subFrameWebPageRelativeUri);
                        }
                        else if (webPage.Contains(subFrameUriWithoutScheme))
                        {
                            Logger.WriteToLog($"Replacing '{subFrameUriWithoutScheme}' with '{subFrameWebPageRelativeUri}'");
                            webPage = webPage.Replace(subFrameUriWithoutScheme, $"{subFrameWebPageRelativeUri}");
                        }
                        else if (webPage.Contains(subFrameUriWithoutMainUri))
                        {
                            Logger.WriteToLog($"Replacing '{subFrameUriWithoutMainUri}' with '{subFrameWebPageRelativeUri}'");
                            webPage = webPage.Replace(subFrameUriWithoutMainUri, $"{subFrameWebPageRelativeUri}");
                        }
                        else
                        {
                            Logger.WriteToLog($"Could not find any resources with url '{subFrameUri}' in the web page");
                        }

                        File.WriteAllText(subFrameWebPageFileName, subFrameResourceWebPage);
                    }
                }

                File.WriteAllText(webPageFileName, webPage);
                return(webPageFileName);
            }
            catch (Exception exception)
            {
                Logger.WriteToLog(ExceptionHelpers.GetInnerException(exception));
                throw;
            }
        }
Ejemplo n.º 30
0
        /// <summary>
        ///     Extracts a WIM file contained in fileEntry.
        /// </summary>
        /// <param name="fileEntry"> FileEntry to extract </param>
        /// <returns> Extracted files </returns>
        public async IAsyncEnumerable <FileEntry> ExtractAsync(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor)
        {
            DiscUtils.Wim.WimFile?baseFile = null;
            try
            {
                baseFile = new DiscUtils.Wim.WimFile(fileEntry.Content);
            }
            catch (Exception e)
            {
                Logger.Debug(e, "Failed to init WIM image.");
            }
            if (baseFile != null)
            {
                if (options.Parallel)
                {
                    var files = new ConcurrentStack <FileEntry>();

                    for (var i = 0; i < baseFile.ImageCount; i++)
                    {
                        var image    = baseFile.GetImage(i);
                        var fileList = image.GetFiles(image.Root.FullName, "*.*", SearchOption.AllDirectories).ToList();
                        while (fileList.Count > 0)
                        {
                            var batchSize       = Math.Min(options.BatchSize, fileList.Count);
                            var range           = fileList.Take(batchSize);
                            var streamsAndNames = new List <(DiscFileInfo, Stream)>();
                            foreach (var file in range)
                            {
                                try
                                {
                                    var info = image.GetFileInfo(file);
                                    var read = info.OpenRead();
                                    streamsAndNames.Add((info, read));
                                }
                                catch (Exception e)
                                {
                                    Logger.Debug("Error reading {0} from WIM {1} ({2}:{3})", file, image.FriendlyName, e.GetType(), e.Message);
                                }
                            }
                            governor.CheckResourceGovernor(streamsAndNames.Sum(x => x.Item1.Length));
                            streamsAndNames.AsParallel().ForAll(file =>
                            {
                                var newFileEntry = new FileEntry($"{image.FriendlyName}\\{file.Item1.FullName}", file.Item2, fileEntry);
                                var entries      = Context.Extract(newFileEntry, options, governor);
                                if (entries.Any())
                                {
                                    files.PushRange(entries.ToArray());
                                }
                            });
                            fileList.RemoveRange(0, batchSize);

                            while (files.TryPop(out var result))
                            {
                                if (result != null)
                                {
                                    yield return(result);
                                }
                            }
                        }
                    }
                }
                else
                {
                    for (var i = 0; i < baseFile.ImageCount; i++)
                    {
                        var image = baseFile.GetImage(i);
                        foreach (var file in image.GetFiles(image.Root.FullName, "*.*", SearchOption.AllDirectories))
                        {
                            Stream?stream = null;
                            try
                            {
                                var info = image.GetFileInfo(file);
                                stream = info.OpenRead();
                                governor.CheckResourceGovernor(info.Length);
                            }
                            catch (Exception e)
                            {
                                Logger.Debug("Error reading {0} from WIM {1} ({2}:{3})", file, image.FriendlyName, e.GetType(), e.Message);
                            }
                            if (stream != null)
                            {
                                var name         = file.Replace('\\', Path.DirectorySeparatorChar);
                                var newFileEntry = await FileEntry.FromStreamAsync($"{image.FriendlyName}{Path.DirectorySeparatorChar}{name}", stream, fileEntry);

                                await foreach (var entry in Context.ExtractAsync(newFileEntry, options, governor))
                                {
                                    yield return(entry);
                                }
                                stream.Dispose();
                            }
                        }
                    }
                }
            }
            else
            {
                if (options.ExtractSelfOnFail)
                {
                    yield return(fileEntry);
                }
            }
        }
Ejemplo n.º 31
0
 private string?GetZipPassword(FileEntry fileEntry, ZipFile zipFile, ZipEntry zipEntry, ExtractorOptions options)
 {
     foreach (var passwords in options.Passwords.Where(x => x.Key.IsMatch(fileEntry.Name)))
     {
         foreach (var password in passwords.Value)
         {
             zipFile.Password = password;
             try
             {
                 using var zipStream = zipFile.GetInputStream(zipEntry);
                 return(password);
             }
             catch (Exception e)
             {
                 Logger.Debug(Extractor.DEBUG_STRING, ArchiveFileType.ZIP, fileEntry.FullPath, zipEntry.Name, e.GetType());
             }
         }
     }
     return(null);
 }