Пример #1
0
        /// <summary>
        /// Builds a <see cref="WarcItem"/> from the fields of this raw CDX row.
        /// </summary>
        /// <param name="folder">Folder that is combined with this row's file name to form the WARC file path.</param>
        /// <param name="fileNameBytes">
        /// Caller-held cache of the file name bytes seen last. When it equals this row's file name the cached
        /// <paramref name="fileNameString"/> is reused; otherwise both cache slots are overwritten.
        /// </param>
        /// <param name="fileNameString">Caller-held cache of the combined path that belongs to <paramref name="fileNameBytes"/>.</param>
        /// <returns>The populated item.</returns>
        public WarcItem ToWarcItem(string folder, ref byte[] fileNameBytes, ref string fileNameString)
        {
            bool cachedNameMatches =
                fileNameBytes != null &&
                fileNameBytes.Length == this.FileName.Length &&
                this.FileName.Bytes.BlockEquals((ReadOnlySpan<byte>)fileNameBytes.Slice());

            if (!cachedNameMatches)
            {
                // Cache miss: remember the new name bytes and rebuild the combined path once.
                fileNameBytes = new byte[this.FileName.Length];
                this.FileName.Bytes.CopyTo(fileNameBytes);
                fileNameString = Path.Combine(folder, this.FileName.ToString());
            }

            string warcPath = fileNameString;

            var item = new WarcItem();
            item.Url = this.OriginalUrl.ToString();
            item.CompressedOffset = Utf8Utils.ParseInt64(this.CompressedArcFileOffset);
            item.CompressedLength = Utf8Utils.ParseInt64(this.CompressedRecordSize);
            item.Date = ParseDate(this.Date);

            // An empty payload-length field maps to -1.
            if (this.PayloadLength.Length != 0)
            {
                item.PayloadLength = Utf8Utils.ParseInt64(this.PayloadLength);
            }
            else
            {
                item.PayloadLength = -1;
            }

            item.WarcFile = warcPath;

            // Fields of length 0 or 1 are treated as "no last-modified date".
            if (this.LastModified.Length > 1)
            {
                item.LastModified = ParseDate(this.LastModified);
            }
            else
            {
                item.LastModified = null;
            }

            item.ContentType = this.MimeTypeOfOriginalDocument.ToStringCached();

            if (this.ResponseCode.Length != 0)
            {
                item.ResponseCode = (HttpStatusCode)Utf8Utils.ParseInt32(this.ResponseCode);
            }
            else
            {
                item.ResponseCode = default(HttpStatusCode);
            }

            return item;
        }
Пример #2
0
        /// <summary>
        /// Parses a date field that is either a 14-long concatenated timestamp or a count of seconds
        /// since 1970-01-01 00:00:00 UTC.
        /// </summary>
        /// <param name="date">The raw date field.</param>
        /// <returns>The parsed date.</returns>
        /// <exception cref="ArgumentException">
        /// The value is not 14 long and its numeric value is not below the epoch-seconds threshold.
        /// </exception>
        private DateTime ParseDate(Utf8String date)
        {
            // Numbers below this threshold are interpreted as epoch seconds; the literal is shaped
            // like a concatenated timestamp for the year 1980.
            const long EpochSecondsUpperBound = 1980_00_00_000000;

            if (date.Length == 14)
            {
                // NOTE(review): presumably yyyyMMddHHmmss - confirm against Utf8Utils.ParseDateConcatenated.
                return Utf8Utils.ParseDateConcatenated(date);
            }

            var num = Utf8Utils.ParseInt64(date);

            if (num < EpochSecondsUpperBound)
            {
                return new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc).AddSeconds(num);
            }

            throw new ArgumentException("Value is neither a 14-character concatenated timestamp nor a number of seconds since 1970-01-01 UTC.", nameof(date));
        }
Пример #3
0
        /// <summary>
        /// Builds a CDX index for the given WARC files. The index is written to <c>cdx + ".tmp"</c>
        /// and, once every file has been processed, the existing <paramref name="cdx"/> file is deleted
        /// and the temporary file is moved into its place.
        /// </summary>
        /// <remarks>
        /// Each record is decompressed through its own <see cref="GZipStream"/> opened at the current
        /// position of the WARC file stream. Only records containing a header line equal to
        /// <c>Warc_Response</c> produce an index line; other records are skipped over.
        /// If an exception occurs while the file stream is positioned at its end, the file is reported
        /// as truncated on the console and processing continues with the next file; any other
        /// exception is rethrown (leaving the ".tmp" file behind).
        /// </remarks>
        /// <param name="cdx">Path of the index file to (re)create.</param>
        /// <param name="warcs">Paths of the WARC files to index. Each file name is echoed to the console.</param>
        public static void GenerateCdx(string cdx, IEnumerable <string> warcs)
        {
            var scratchpad = new Scratchpad();
            // Throw-away buffer used only to drain HTTP bodies while counting their length.
            var buf        = new byte[16 * 1024];

            using (var output = File.Open(cdx + ".tmp", FileMode.Create, FileAccess.Write, FileShare.Delete | FileShare.Read))
            {
                using (var writer = new Utf8StreamWriter(output))
                {
                    // First line of the index is the column header.
                    writer.WriteClrStringLine(WarcColumns);
                    foreach (var warc in warcs)
                    {
                        Console.WriteLine(Path.GetFileName(warc));

                        using (var warcStream = File.Open(warc, FileMode.Open, FileAccess.Read, FileShare.Delete | FileShare.Read))
                        {
                            try
                            {
                                var warcname = new Utf8String(Path.GetFileName(warc));
                                // One iteration per record, until the file position reaches the end of the file.
                                while (warcStream.Position != warcStream.Length)
                                {
                                    var  startPosition     = warcStream.Position;
                                    long warcContentLength = -1;
                                    long payloadLength     = -1;
                                    int  responseCode      = -1;
                                    var  contentType       = Utf8String.Empty;
                                    // 14 slots for the date digits; presumably borrowed from the scratchpad until Reset() - TODO confirm.
                                    var  date = scratchpad.Use(14);
                                    // A zero first byte marks "no date header seen" (checked when the index line is written).
                                    date[0] = 0;
                                    Utf8String url          = Utf8String.Empty;
                                    Utf8String shamanUrl    = Utf8String.Empty;
                                    DateTime?  lastModified = null;
                                    bool       isresponse   = false;
                                    // leaveOpen = true: disposing the GZipStream must not close the WARC file stream.
                                    using (var gz = new GZipStream(warcStream, CompressionMode.Decompress, true))
                                    {
                                        using (var reader = new Utf8StreamReader(gz, true))
                                        {
                                            // Read the WARC header lines up to the first empty line.
                                            while (true)
                                            {
                                                if (reader.IsCompleted)
                                                {
                                                    throw new EndOfStreamException();
                                                }
                                                var line = reader.ReadLine();
                                                if (line.Length == 0)
                                                {
                                                    break;
                                                }
                                                // Independent check: not part of the else-if chain below.
                                                if (line.Equals(Warc_Response))
                                                {
                                                    isresponse = true;
                                                }
                                                if (line.StartsWith(Warc_ContentLength))
                                                {
                                                    warcContentLength = Utf8Utils.ParseInt64(WarcItem.GetHeaderValue(line));
                                                }
                                                else if (line.StartsWith(Warc_Date))
                                                {
                                                    // Copies the digit groups at offsets 0, 5, 8, 11, 14 and 17 into a
                                                    // contiguous 14-byte value, skipping the single separator between groups.
                                                    // NOTE(review): assumes a "yyyy-MM-ddTHH:mm:ss"-shaped value - confirm.
                                                    var val = WarcItem.GetHeaderValue(line).Bytes;
                                                    val.Slice(0, 4).CopyTo(date.Slice(0));
                                                    val.Slice(5, 2).CopyTo(date.Slice(4));
                                                    val.Slice(8, 2).CopyTo(date.Slice(6));
                                                    val.Slice(11, 2).CopyTo(date.Slice(8));
                                                    val.Slice(14, 2).CopyTo(date.Slice(10));
                                                    val.Slice(17, 2).CopyTo(date.Slice(12));
                                                }
                                                else if (line.StartsWith(Warc_URL))
                                                {
                                                    url = scratchpad.Copy(WarcItem.GetHeaderValue(line));
                                                }
                                                // Independent check as well; when present this URL is written instead of url.
                                                if (line.StartsWith(Warc_Shaman_URI))
                                                {
                                                    shamanUrl = scratchpad.Copy(WarcItem.GetHeaderValue(line));
                                                }
                                            }
                                            // A record without a content length header cannot be processed.
                                            if (warcContentLength == -1)
                                            {
                                                throw new InvalidOperationException();
                                            }


                                            if (isresponse)
                                            {
                                                // Parse the HTTP response and drain its body to measure the payload length.
                                                using (var s = WarcItem.OpenHttp(reader, scratchpad, new Uri(url.ToString()), warcContentLength, out var payloadLengthFromHeader, out var redirectLocation, out responseCode, out contentType, out lastModified, null))
                                                {
                                                    long l = 0;
                                                    while (true)
                                                    {
                                                        var m = s.Read(buf, 0, buf.Length);
                                                        if (m == 0)
                                                        {
                                                            break;
                                                        }
                                                        l += m;
                                                    }
                                                    payloadLength = l;
                                                    // -1 means OpenHttp reported no usable length from the headers.
                                                    if (payloadLengthFromHeader != -1 && payloadLengthFromHeader != payloadLength)
                                                    {
                                                        throw new Exception("Content-Length mismatch.");
                                                    }
                                                }
                                                //var httpData = new LimitedStream(reader, contentLength);
                                                // The record body must be followed by two CR LF pairs (bytes 13, 10, 13, 10).
                                                var cr = reader.ReadByte();
                                                if (cr != 13)
                                                {
                                                    throw new InvalidDataException();
                                                }
                                                var lf = reader.ReadByte();
                                                if (lf != 10)
                                                {
                                                    throw new InvalidDataException();
                                                }


                                                cr = reader.ReadByte();
                                                if (cr != 13)
                                                {
                                                    throw new InvalidDataException();
                                                }
                                                lf = reader.ReadByte();
                                                if (lf != 10)
                                                {
                                                    throw new InvalidDataException();
                                                }
                                                //if (reader.ReadByte() != 13) throw new Exception();
                                                //if (reader.ReadByte() != 10) throw new Exception();
                                            }
                                            else
                                            {
                                                // Not a response record: skip exactly warcContentLength bytes of body.
                                                var remaining = warcContentLength;
                                                while (remaining != 0)
                                                {
                                                    var m = reader.Read((int)Math.Min(remaining, int.MaxValue));
                                                    if (m.Count == 0)
                                                    {
                                                        throw new Exception();
                                                    }
                                                    remaining -= m.Count;
                                                }

                                                // Two empty lines must follow the body...
                                                var e = reader.ReadLine();
                                                if (e.Length != 0)
                                                {
                                                    throw new InvalidDataException();
                                                }
                                                e = reader.ReadLine();
                                                if (e.Length != 0)
                                                {
                                                    throw new InvalidDataException();
                                                }
                                                // ...and one more read attempt must leave the reader completed.
                                                e = reader.ReadLine();
                                                if (!reader.IsCompleted)
                                                {
                                                    throw new InvalidDataException();
                                                }
                                            }

                                            //var r = reader.RemainingBufferedData;
                                            // Nothing may remain in this record's decompressed stream.
                                            var end = reader.ReadByte();
                                            if (end != -1)
                                            {
                                                throw new InvalidDataException();
                                            }
                                            //Console.WriteLine($"Remaining: {r.Length}");
                                        }



                                        // Rewind the file position by the amount reported by GetRemainingUnusedBytes.
                                        // NOTE(review): presumably the compressed bytes the GZipStream read ahead but did
                                        // not consume, so the next record starts at the right offset - confirm in the helper.
                                        warcStream.Position -= GetRemainingUnusedBytes(gz);
                                    }



                                    if (isresponse)
                                    {
                                        // Index line, space separated: URL, start offset, compressed length, date,
                                        // WARC file name, response code, content type, payload length, last-modified.
                                        // '-' stands in for a missing date, response code or last-modified value.
                                        if (shamanUrl.Length > 0)
                                        {
                                            writer.Write(shamanUrl);
                                        }
                                        else
                                        {
                                            writer.Write(url);
                                        }
                                        writer.Write((byte)' ');
                                        writer.Write(startPosition);
                                        writer.Write((byte)' ');
                                        // Compressed size of the record: file position after the record minus its start.
                                        writer.Write(warcStream.Position - startPosition);
                                        writer.Write((byte)' ');
                                        if (date[0] != 0)
                                        {
                                            writer.Write(date);
                                        }
                                        else
                                        {
                                            writer.Write((byte)'-');
                                        }
                                        writer.Write((byte)' ');
                                        writer.Write(warcname);
                                        writer.Write((byte)' ');
                                        if (responseCode != -1)
                                        {
                                            writer.Write(responseCode);
                                        }
                                        else
                                        {
                                            writer.Write((byte)'-');
                                        }
                                        writer.Write((byte)' ');
                                        writer.Write(contentType);
                                        writer.Write((byte)' ');
                                        writer.Write(payloadLength);
                                        writer.Write((byte)' ');
                                        if (lastModified != null)
                                        {
                                            WriteDate(writer, lastModified.Value);
                                        }
                                        else
                                        {
                                            writer.Write((byte)'-');
                                        }
                                        // Every index line ends with a trailing space before the line break.
                                        writer.Write((byte)' ');

                                        writer.WriteLine();
                                    }
                                    // NOTE(review): presumably releases the per-record values obtained via Use/Copy - confirm.
                                    scratchpad.Reset();
                                }
                            }
                            catch
                            {
                                // NOTE(review): any exception type is treated as truncation as long as the file
                                // position is at the end of the file; the exception details are discarded.
                                if (warcStream.Position == warcStream.Length)
                                {
                                    Console.WriteLine("WARNING: truncated WARC.");;
                                }
                                else
                                {
                                    throw;
                                }
                            }
                        }
                    }
                }
            }
            // Replace the previous index with the freshly written one.
            File.Delete(cdx);
            File.Move(cdx + ".tmp", cdx);
        }
Пример #4
0
        /// <summary>
        /// Reads the HTTP status line and headers from <paramref name="httpReader"/> and returns a stream
        /// over the response body with chunked transfer encoding and gzip/brotli content encoding undone.
        /// </summary>
        /// <param name="httpReader">Reader positioned at the HTTP status line.</param>
        /// <param name="scratchpad">Used to copy the content type value; when null the content type is not captured.</param>
        /// <param name="requestedUrl">Base URL used to resolve the Location header.</param>
        /// <param name="responseLength">
        /// Total length of the HTTP response (status line, headers and body) as counted by the reader position,
        /// or -1 when unknown. When known, the body stream is limited to the remaining length.
        /// </param>
        /// <param name="payloadLength">
        /// Value of the content length header, or -1 when it is absent or when the body is chunked or compressed.
        /// </param>
        /// <param name="location">Resolved Location header, or null when absent or not parseable.</param>
        /// <param name="responseCode">Status code parsed from the status line.</param>
        /// <param name="contentType">
        /// Content type truncated at the first space or ';'. <c>Utf8String.Empty</c> when the header is absent
        /// or <paramref name="scratchpad"/> is null.
        /// </param>
        /// <param name="lastModified">Parsed Last-Modified header, or null when absent or not parseable.</param>
        /// <param name="onHttpHeader">Optional callback invoked with the trimmed name and value of each header line.</param>
        /// <returns>The body stream. Layers, innermost first: length limit, chunk decoding, decompression.</returns>
        /// <exception cref="InvalidDataException">
        /// The reader completed before the blank line ending the headers, or the content length header
        /// disagrees with the length derived from <paramref name="responseLength"/>.
        /// </exception>
        public static Stream OpenHttp(Utf8StreamReader httpReader, Scratchpad scratchpad, Uri requestedUrl, long responseLength, out long payloadLength, out Uri location, out int responseCode, out Utf8String contentType, out DateTime?lastModified, Action <Utf8String, Utf8String> onHttpHeader)
        {
            var startPosition = httpReader.Position;

            payloadLength = -1;
            location      = null;
            lastModified  = null;
            // Must be assigned on every path: the Content-Type header may be absent or skipped.
            contentType   = Utf8String.Empty;

            bool chunked      = false;
            bool gzipped      = false;
            bool brotli       = false;
            var  responseLine = httpReader.ReadLine();

            // The status code sits between the first two spaces, or after the only space when there is no reason phrase.
            responseCode = (int)Utf8Utils.ParseInt64(responseLine.TryCaptureBetween((byte)' ', (byte)' ') ?? responseLine.CaptureAfter((byte)' '));
            while (true)
            {
                var line = httpReader.ReadLine();
                if (httpReader.IsCompleted)
                {
                    throw new InvalidDataException();
                }
                if (line.Length == 0)
                {
                    // Blank line: end of headers.
                    break;
                }
                if (onHttpHeader != null)
                {
                    var d = line.IndexOf((byte)':');
                    // Lines without a colon have no name/value split; do not report them.
                    if (d >= 0)
                    {
                        onHttpHeader(line.Substring(0, d).Trim(), line.Substring(d + 1).Trim());
                    }
                }
                if (line.StartsWith(Http_TransferEncoding))
                {
                    var value = GetHeaderValue(line);
                    if (value.Equals("chunked"))
                    {
                        chunked = true;
                    }
                }
                else if (line.StartsWith(Http_ContentLength))
                {
                    payloadLength = Utf8Utils.ParseInt64(GetHeaderValue(line));
                }
                else if (line.StartsWith(Http_ContentEncoding))
                {
                    var value = GetHeaderValue(line);
                    if (value == Http_Gzip)
                    {
                        gzipped = true;
                    }
                    else if (value == Http_Brotli)
                    {
                        brotli = true;
                    }
                }
                else if (line.StartsWith(Http_Location))
                {
                    var val = GetHeaderValue(line).ToString();
                    try
                    {
                        if (val.StartsWith("//"))
                        {
                            // Scheme-relative redirect: borrow the scheme of the requested URL.
                            location = new Uri(requestedUrl.Scheme + ":" + val);
                        }
                        else
                        {
                            location = new Uri(requestedUrl, val);
                        }
                    }
                    catch (Exception)
                    {
                        // Best effort: a Location value that cannot be resolved leaves location as it was.
                    }
                }
                else if (line.StartsWith(Http_ContentType) && scratchpad != null)
                {
                    var value = GetHeaderValue(line);
                    value       = value.TryCaptureBefore((byte)' ') ?? value;
                    value       = value.TryCaptureBefore((byte)';') ?? value;
                    contentType = scratchpad.Copy(value);
                }
                else if (line.StartsWith(Http_LastModified))
                {
                    try
                    {
                        lastModified = WarcCdxItemRaw.ParseHttpDate(GetHeaderValue(line));
                    }
                    catch
                    {
                        // Best effort: an unparseable date leaves lastModified as it was.
                    }
                }
            }

            var compressed = gzipped || brotli;

            if (compressed || chunked)
            {
                // The content length header describes encoded bytes, not the decoded payload.
                payloadLength = -1;
            }
            Stream s;

            if (responseLength != -1)
            {
                var currentPos     = httpReader.Position - startPosition;
                var httpBodyLength = responseLength - currentPos;

                // payloadLength is already -1 for chunked or compressed bodies, so this only
                // fires for plain bodies that declared a length.
                if (payloadLength != -1 && httpBodyLength != payloadLength)
                {
                    throw new InvalidDataException("Unexpected Content-Length.");
                }
                s = new LimitedStream(httpReader, httpBodyLength);
            }
            else
            {
                s = httpReader;
            }
            if (chunked)
            {
                s = new ChunkedStream(s);
            }
            if (compressed && chunked)
            {
                // NOTE(review): presumably drains the rest of the chunked stream on dispose - confirm.
                s = new OnDisposeConsumeStream(s);
            }

            if (gzipped)
            {
                s = new GZipStream(s, CompressionMode.Decompress);
            }
            else if (brotli)
            {
                s = new BrotliStream(s, CompressionMode.Decompress);
            }

            return s;
        }
Пример #5
0
 /// <summary>
 /// Extracts the number that follows "Content-Length:" in a WARC record header.
 /// </summary>
 /// <param name="warcHeader">Header text of a WARC record.</param>
 /// <returns>The trimmed text between "Content-Length:" and the next line feed, parsed as a 64-bit integer.</returns>
 private static long ReadWarcRecordContentLength(Utf8String warcHeader)
 {
     var fieldName = (Utf8String)"Content-Length:";
     var lineFeed  = (Utf8String)"\n";

     // Everything between the field name and the end of its line, without surrounding whitespace.
     var rawValue = warcHeader.CaptureBetween(fieldName, lineFeed);
     var digits   = rawValue.Trim();

     return Utf8Utils.ParseInt64(digits);
 }