/// <summary>
/// Starts writing the "wikipedia-1-0.warc" example into a <c>WarcReader</c> without waiting
/// for the write to finish, reads all records back, and expects 24 of them.
/// </summary>
public async Task Can_read_mixed_records()
{
    // Arrange
    var inputStr = Example.Get("wikipedia-1-0.warc");
    var reader = new WarcReader();

    // An async local function is used instead of ContinueWith(...): ContinueWith with a
    // task-returning lambda produces a nested task, so awaiting it would not wait for
    // CompleteWriting(), and a failure in WriteAsync would never be observed.
    async Task WriteThenCompleteAsync()
    {
        try
        {
            await reader.WriteAsync(inputStr);
        }
        finally
        {
            // Mirrors the original continuation, which completed writing regardless of
            // how WriteAsync ended. Presumably this is what lets ReadAllAsync terminate.
            await reader.CompleteWriting();
        }
    }

    var writingTask = WriteThenCompleteAsync();

    // Act
    var result = await ToList(reader.ReadAllAsync());

    // Assert
    result.Should().HaveCount(24);

    // Surfaces any exception raised while writing or completing.
    await writingTask;
}
/// <summary>
/// Writes the "warcinfo.warc" example into a <c>WarcReader</c>, completes writing, and
/// checks that exactly one record is read back.
/// </summary>
public async Task Can_read_single_record()
{
    // Arrange: the whole input is written and completed before reading begins.
    var warcInput = Example.Get("warcinfo.warc");
    var reader = new WarcReader();
    await reader.WriteAsync(warcInput);
    await reader.CompleteWriting();

    // Act
    var records = await ToList(reader.ReadAllAsync());

    // Assert
    records.Should().HaveCount(1);
}
/// <summary>
/// Reads every record from the inputs named by <paramref name="verb"/> and reports, per host,
/// the number of distinct path-and-query values seen in "response" records, followed by
/// the total number of response records counted.
/// </summary>
/// <param name="verb">Options for the analyze command; only <c>Inputs</c> is read here.</param>
/// <remarks>
/// Response records whose <c>WARC-Target-URI</c> header is missing or is not an absolute URI
/// are reported and skipped instead of aborting the whole run.
/// </remarks>
public async Task Invoke(AnalyzeVerb verb)
{
    var reader = new WarcReader();
    var bufferingTask = ReadInputs(verb.Inputs, reader);

    var urlsToDomain = new Dictionary<string, HashSet<string>>();
    var totalFiles = 0;

    await foreach (var warcRecord in reader.ReadAllAsync())
    {
        // Group by name before building the lookup: a plain ToDictionary throws when a
        // header name occurs more than once. The first occurrence wins.
        var headers = warcRecord.Header.Fields
            .GroupBy(x => x.Name, StringComparer.OrdinalIgnoreCase)
            .ToDictionary(g => g.Key, g => g.First().Value, StringComparer.OrdinalIgnoreCase);

        // TryGetValue instead of the indexer, so a record without WARC-Type is skipped
        // rather than raising KeyNotFoundException.
        if (!headers.TryGetValue("WARC-Type", out var type)
            || type == null
            || !type.Equals("response", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        if (!headers.TryGetValue("WARC-Target-URI", out var targetUri) || targetUri == null)
        {
            await _writer.Info("Skipping response record without a WARC-Target-URI header.");
            continue;
        }

        // Some writers wrap the URI in angle brackets; strip them before parsing.
        // The char overloads compare ordinally, independent of the current culture.
        if (targetUri.StartsWith('<') && targetUri.EndsWith('>'))
        {
            targetUri = targetUri.Substring(1, targetUri.Length - 2);
        }

        if (!Uri.TryCreate(targetUri, UriKind.Absolute, out var uri))
        {
            await _writer.Info($"Skipping response record with invalid WARC-Target-URI '{targetUri}'.");
            continue;
        }

        // Single lookup instead of ContainsKey followed by the indexer.
        if (!urlsToDomain.TryGetValue(uri.Host, out var urlsForHost))
        {
            urlsForHost = new HashSet<string>();
            urlsToDomain[uri.Host] = urlsForHost;
        }

        urlsForHost.Add(uri.PathAndQuery);
        totalFiles++;
    }

    foreach (var (domain, urls) in urlsToDomain)
    {
        await _writer.Info($"'{domain}': {urls.Count}");
    }

    await _writer.Info($"Total URL's: {totalFiles}");

    // Observe the outcome of the background input reading.
    await bufferingTask;
}
/// <summary>
/// Writes the "warcinfo.warc" example twice into one <c>WarcReader</c> via
/// <c>WriteAllAsync</c>, completes writing, and checks that two records are read back.
/// </summary>
public async Task Can_read_multiple_records()
{
    // Arrange: the same example file is loaded twice and written as a batch.
    var firstInput = Example.Get("warcinfo.warc");
    var secondInput = Example.Get("warcinfo.warc");
    var inputs = new[] { firstInput, secondInput };

    var reader = new WarcReader();
    await reader.WriteAllAsync(inputs);
    await reader.CompleteWriting();

    // Act
    var records = await ToList(reader.ReadAllAsync());

    // Assert
    records.Should().HaveCount(2);
}
/// <summary>
/// Reads every record from the inputs named by <paramref name="filesVerb"/> and writes the
/// payload of each "response" record to <c>{output}/{host}/{path}</c>, where the path comes
/// from the record's <c>WARC-Target-URI</c>.
/// </summary>
/// <param name="filesVerb">
/// Options for the command; <c>Inputs</c> names the sources and <c>Output</c> is the target
/// directory ("./" when null).
/// </param>
/// <exception cref="InvalidOperationException">
/// A target URI resolves to a location outside the output directory.
/// </exception>
/// <remarks>
/// Records with a missing or non-absolute target URI are reported and skipped. Failures while
/// writing an individual file are reported as warnings and processing continues.
/// </remarks>
public async Task Invoke(WriteFilesVerb filesVerb)
{
    var outputFilePrefix = Path.GetFullPath(filesVerb.Output ?? "./");

    // The traversal guard below is a prefix test, so the root must end with a separator;
    // otherwise "/out-evil/x" would be accepted for root "/out".
    if (!outputFilePrefix.EndsWith(Path.DirectorySeparatorChar))
    {
        outputFilePrefix += Path.DirectorySeparatorChar;
    }

    var reader = new WarcReader();
    var bufferingTask = ReadInputs(filesVerb.Inputs, reader);

    await foreach (var warcRecord in reader.ReadAllAsync())
    {
        // Group by name before building the lookup: a plain ToDictionary throws when a
        // header name occurs more than once. The first occurrence wins.
        var headers = warcRecord.Header.Fields
            .GroupBy(x => x.Name, StringComparer.OrdinalIgnoreCase)
            .ToDictionary(g => g.Key, g => g.First().Value, StringComparer.OrdinalIgnoreCase);

        // TryGetValue instead of the indexer, so a record without WARC-Type is skipped
        // rather than raising KeyNotFoundException.
        if (!headers.TryGetValue("WARC-Type", out var type)
            || type == null
            || !type.Equals("response", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        if (!headers.TryGetValue("WARC-Target-URI", out var targetUri) || targetUri == null)
        {
            await _writer.Warn("Skipping response record without a WARC-Target-URI header.");
            continue;
        }

        // Some writers wrap the URI in angle brackets; strip them before parsing.
        if (targetUri.StartsWith('<') && targetUri.EndsWith('>'))
        {
            targetUri = targetUri.Substring(1, targetUri.Length - 2);
        }

        if (!Uri.TryCreate(targetUri, UriKind.Absolute, out var uri))
        {
            await _writer.Warn($"Skipping response record with invalid WARC-Target-URI '{targetUri}'.");
            continue;
        }

        var domain = uri.Host;
        var targetFilePath = uri.AbsolutePath;

        // /post/ -> /post/index.html
        // /      -> /index.html
        if (targetFilePath.EndsWith('/'))
        {
            targetFilePath += "index.html";
        }

        // /index.html -> index.html
        if (targetFilePath.StartsWith('/'))
        {
            targetFilePath = targetFilePath.Substring(1);
        }

        // index.html -> {output}/example.com/index.html
        targetFilePath = Path.GetFullPath(Path.Join(outputFilePrefix, domain, targetFilePath));

        // Ordinal comparison: both strings come from Path.GetFullPath over the same root,
        // so no culture-specific matching is wanted here.
        if (!targetFilePath.StartsWith(outputFilePrefix, StringComparison.Ordinal))
        {
            throw new InvalidOperationException($"Possible dangerous path transverse detected for '{targetUri}'.");
        }

        await _writer.Info($"Writing file '{targetFilePath}' from '{targetUri}'.");

        if (File.Exists(targetFilePath))
        {
            await _writer.Warn($"File '{targetFilePath}' already exists and will be overwritten.");
        }

        try
        {
            var directory = Path.GetDirectoryName(targetFilePath);
            if (!string.IsNullOrEmpty(directory))
            {
                // CreateDirectory does nothing when the directory already exists.
                Directory.CreateDirectory(directory);
            }

            // File.Create truncates an existing file. File.OpenWrite would leave stale
            // trailing bytes when the new payload is shorter than the old content.
            await using var file = File.Create(targetFilePath);
            await file.WriteAsync(warcRecord.Payload.Data);
        }
        catch (Exception e)
        {
            // Best effort: one unwritable file must not stop the remaining records.
            await _writer.Warn($"An error occurred while writing '{targetFilePath}', moving to next. {e}");
        }
    }

    // Observe the outcome of the background input reading.
    await bufferingTask;
}