Esempio n. 1
0
        /// <summary>
        /// Feeds every existing input file into <paramref name="reader"/>, wrapping files whose
        /// name ends in <c>.gz</c> in a decompressing <see cref="GZipStream"/>, and finally
        /// signals the reader that no more data will be written.
        /// </summary>
        /// <param name="inputs">Paths of the input files. Paths that do not exist are skipped with a warning.</param>
        /// <param name="reader">Reader that receives the content of each file.</param>
        /// <returns>A task that completes once all inputs were written and writing was completed.</returns>
        private async Task ReadInputs(IEnumerable<string> inputs, WarcReader reader)
        {
            try
            {
                foreach (var input in inputs)
                {
                    await _writer.Info($"Considering input file '{input}'.");

                    if (!File.Exists(input))
                    {
                        await _writer.Warn($"Input file '{input}' does not exist and will be ignored.");
                        continue;
                    }

                    await using var fileStream = File.OpenRead(input);

                    // File extensions are not linguistic text: compare ordinally, and ignore case so
                    // that names such as "ARCHIVE.WARC.GZ" are decompressed as well.
                    if (input.EndsWith(".gz", StringComparison.OrdinalIgnoreCase))
                    {
                        await using var compressionStream = new GZipStream(fileStream, CompressionMode.Decompress);
                        await reader.WriteAsync(compressionStream);
                    }
                    else
                    {
                        await reader.WriteAsync(fileStream);
                    }

                    await _writer.Info($"Input file '{input}' has been completely buffered.");
                }
            }
            finally
            {
                // Complete the writing side even when a file fails half-way. Callers enumerate
                // reader.ReadAllAsync() before awaiting this task, so without this they would
                // presumably keep waiting for more data and never observe the failure.
                await reader.CompleteWriting();
            }
        }
Esempio n. 2
0
        /// <summary>
        /// Writes the "wikipedia-1-0.warc" example into a reader while reading from it
        /// concurrently, and expects 24 records to come out.
        /// </summary>
        public async Task Can_read_mixed_records()
        {
            // Arrange
            var inputStr = Example.Get("wikipedia-1-0.warc");
            var reader   = new WarcReader();

            // Writing runs concurrently with the read below. The local function keeps a failure
            // of WriteAsync observable through the returned task and guarantees that
            // CompleteWriting is awaited (not just started) whether or not the write succeeded.
            async Task WriteThenComplete()
            {
                try
                {
                    await reader.WriteAsync(inputStr);
                }
                finally
                {
                    await reader.CompleteWriting();
                }
            }

            var writingTask = WriteThenComplete();

            // Act
            var result = await ToList(reader.ReadAllAsync());

            // Surface a write failure as itself instead of as a misleading record count below.
            await writingTask;

            // Assert
            result.Should().HaveCount(24);
        }
Esempio n. 3
0
        /// <summary>
        /// Writes the "warcinfo.warc" example into a reader, completes writing, and expects
        /// exactly one record when reading everything back.
        /// </summary>
        public async Task Can_read_single_record()
        {
            // Arrange: the whole example is written and writing is completed before any read.
            var example    = Example.Get("warcinfo.warc");
            var warcReader = new WarcReader();

            await warcReader.WriteAsync(example);
            await warcReader.CompleteWriting();

            // Act
            var records = await ToList(warcReader.ReadAllAsync());

            // Assert
            records.Should().HaveCount(1);
        }
Esempio n. 4
0
        /// <summary>
        /// Reads all inputs of <paramref name="verb"/> and reports, for every host seen in a
        /// <c>response</c> record, the number of distinct path-and-query values, followed by
        /// the total number of response records that were counted.
        /// </summary>
        /// <param name="verb">Parsed command line options; only <c>Inputs</c> is used here.</param>
        public async Task Invoke(AnalyzeVerb verb)
        {
            var reader = new WarcReader();

            // Started without awaiting so the files are fed into the reader while the loop
            // below consumes it; the task is awaited at the end to surface its errors.
            var bufferingTask = ReadInputs(verb.Inputs, reader);

            var urlsToDomain = new Dictionary<string, HashSet<string>>();
            var totalFiles   = 0;

            await foreach (var warcRecord in reader.ReadAllAsync())
            {
                // A lookup tolerates repeated header names and yields an empty sequence for
                // absent ones, where ToDictionary plus the indexer would throw in both cases.
                var headers = warcRecord.Header.Fields.ToLookup(x => x.Name, x => x.Value, StringComparer.OrdinalIgnoreCase);
                var type    = headers["WARC-Type"].FirstOrDefault();

                if (!string.Equals(type, "response", StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                var targetUri = headers["WARC-Target-URI"].FirstOrDefault();
                if (string.IsNullOrEmpty(targetUri))
                {
                    await _writer.Warn("A response record without a 'WARC-Target-URI' header was found and will be ignored.");
                    continue;
                }

                // Strip the optional angle brackets around the URI: <http://example.com/> -> http://example.com/
                if (targetUri.StartsWith("<", StringComparison.Ordinal) && targetUri.EndsWith(">", StringComparison.Ordinal))
                {
                    targetUri = targetUri.Substring(1, targetUri.Length - 2);
                }

                // One malformed target must not abort the analysis of the remaining records.
                if (!Uri.TryCreate(targetUri, UriKind.Absolute, out var uri))
                {
                    await _writer.Warn($"Target URI '{targetUri}' is not a valid absolute URI and will be ignored.");
                    continue;
                }

                if (!urlsToDomain.TryGetValue(uri.Host, out var paths))
                {
                    paths = new HashSet<string>();
                    urlsToDomain[uri.Host] = paths;
                }

                paths.Add(uri.PathAndQuery);
                totalFiles++;
            }

            foreach (var (domain, urls) in urlsToDomain)
            {
                await _writer.Info($"'{domain}': {urls.Count}");
            }

            await _writer.Info($"Total URL's: {totalFiles}");

            await bufferingTask;
        }
Esempio n. 5
0
        /// <summary>
        /// Writes the "warcinfo.warc" example twice through <c>WriteAllAsync</c>, completes
        /// writing, and expects two records when reading everything back.
        /// </summary>
        public async Task Can_read_multiple_records()
        {
            // Arrange: the same example is fetched twice and handed over as one batch.
            var first  = Example.Get("warcinfo.warc");
            var second = Example.Get("warcinfo.warc");

            var warcReader = new WarcReader();
            var batch      = new[] { first, second };

            await warcReader.WriteAllAsync(batch);
            await warcReader.CompleteWriting();

            // Act
            var records = await ToList(warcReader.ReadAllAsync());

            // Assert
            records.Should().HaveCount(2);
        }
Esempio n. 6
0
        /// <summary>
        /// Reads all inputs of <paramref name="filesVerb"/> and writes the payload of every
        /// <c>response</c> record to <c>{output}/{host}/{path}</c>, where a path ending in
        /// <c>/</c> is stored as <c>index.html</c> inside that directory.
        /// </summary>
        /// <param name="filesVerb">Parsed command line options; <c>Inputs</c> and <c>Output</c> are used here.</param>
        /// <exception cref="InvalidOperationException">
        /// A record's target would resolve to a location outside of the output directory.
        /// </exception>
        public async Task Invoke(WriteFilesVerb filesVerb)
        {
            var outputFilePrefix = Path.GetFullPath(filesVerb.Output ?? "./");

            // Terminate the root with a separator so the containment check below cannot be
            // satisfied by a sibling directory that merely shares the prefix
            // (e.g. "/out-evil/x" must not pass for the root "/out").
            if (!Path.EndsInDirectorySeparator(outputFilePrefix))
            {
                outputFilePrefix += Path.DirectorySeparatorChar;
            }

            var reader = new WarcReader();

            // Started without awaiting so the files are fed into the reader while the loop
            // below consumes it; the task is awaited at the end to surface its errors.
            var bufferingTask = ReadInputs(filesVerb.Inputs, reader);

            await foreach (var warcRecord in reader.ReadAllAsync())
            {
                // A lookup tolerates repeated header names and yields an empty sequence for
                // absent ones, where ToDictionary plus the indexer would throw in both cases.
                var headers = warcRecord.Header.Fields.ToLookup(x => x.Name, x => x.Value, StringComparer.OrdinalIgnoreCase);
                var type    = headers["WARC-Type"].FirstOrDefault();

                if (!string.Equals(type, "response", StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                var targetUri = headers["WARC-Target-URI"].FirstOrDefault();
                if (string.IsNullOrEmpty(targetUri))
                {
                    await _writer.Warn("A response record without a 'WARC-Target-URI' header was found and will be ignored.");
                    continue;
                }

                // Strip the optional angle brackets around the URI: <http://example.com/> -> http://example.com/
                if (targetUri.StartsWith("<", StringComparison.Ordinal) && targetUri.EndsWith(">", StringComparison.Ordinal))
                {
                    targetUri = targetUri.Substring(1, targetUri.Length - 2);
                }

                // One malformed target must not abort the extraction of the remaining records.
                if (!Uri.TryCreate(targetUri, UriKind.Absolute, out var uri))
                {
                    await _writer.Warn($"Target URI '{targetUri}' is not a valid absolute URI and will be ignored.");
                    continue;
                }

                var domain         = uri.Host;
                var targetFilePath = uri.AbsolutePath;

                // /post/ -> /post/index.html
                // / -> /index.html
                if (targetFilePath.EndsWith("/", StringComparison.Ordinal))
                {
                    targetFilePath += "index.html";
                }

                // /index.html -> index.html
                if (targetFilePath.StartsWith("/", StringComparison.Ordinal))
                {
                    targetFilePath = targetFilePath.Substring(1);
                }

                // index.html -> {output}/example.com/index.html
                targetFilePath = Path.GetFullPath(Path.Join(outputFilePrefix, domain, targetFilePath));

                // Paths are not linguistic text, so the containment check is ordinal.
                if (!targetFilePath.StartsWith(outputFilePrefix, StringComparison.Ordinal))
                {
                    throw new InvalidOperationException($"Possible dangerous path transverse detected for '{targetUri}'.");
                }

                await _writer.Info($"Writing file '{targetFilePath}' from '{targetUri}'.");

                if (File.Exists(targetFilePath))
                {
                    await _writer.Warn($"File '{targetFilePath}' already exists and will be overwritten.");
                }

                try
                {
                    // CreateDirectory is a no-op for directories that already exist.
                    var directory = Path.GetDirectoryName(targetFilePath);
                    if (!string.IsNullOrEmpty(directory))
                    {
                        Directory.CreateDirectory(directory);
                    }

                    // File.Create truncates an existing file. File.OpenWrite would keep the old
                    // content beyond the new payload's length, leaving a corrupt mixture behind.
                    await using var file = File.Create(targetFilePath);
                    await file.WriteAsync(warcRecord.Payload.Data);
                }
                catch (Exception e)
                {
                    // Best effort: a single file that cannot be written must not stop the extraction.
                    await _writer.Warn($"An error occurred while writing '{targetFilePath}', moving to next. {e}");
                }
            }

            await bufferingTask;
        }