Exemple #1
0
        /// <summary>
        /// Lazily reads the records of a CDX index from <paramref name="cdxStream"/>.
        /// </summary>
        /// <remarks>
        /// The first line read is treated as the header: every space-separated label that follows
        /// the <c>CDX</c> marker selects, through <c>allSetters</c>, the setter for the column at the
        /// same position. Labels with no registered setter leave their column ignored.
        /// Every following non-empty line is split on spaces and produces one item; empty fields and
        /// the single-character "-" placeholder are not passed to a setter.
        /// When <paramref name="gzipped"/> is true the stream is read as a sequence of gzip members:
        /// after a member ends, the stream position is moved back by the value returned from
        /// <c>GetRemainingUnusedBytes</c> and a new decompressor is started, until that value is 0.
        /// This requires <paramref name="cdxStream"/> to support setting <c>Position</c>.
        /// </remarks>
        /// <param name="cdxStream">Stream containing the CDX data. It is disposed when enumeration
        /// finishes or when the enumerator is disposed.</param>
        /// <param name="gzipped">True when <paramref name="cdxStream"/> is gzip-compressed.</param>
        /// <returns>A deferred sequence with one <c>WarcCdxItemRaw</c> per non-empty data line.</returns>
        public static IEnumerable <WarcCdxItemRaw> Read(Stream cdxStream, bool gzipped)
        {
            List <Action <WarcCdxItemRaw, Utf8String> > fieldSetters = null;

            using (cdxStream)
            {
                while (true)
                {
                    // One decompressor per gzip member; leaveOpen keeps cdxStream usable for the next one.
                    GZipStream gzipStream = gzipped ? new GZipStream(cdxStream, CompressionMode.Decompress, true) : null;
                    Stream     textStream = (Stream)gzipStream ?? cdxStream;
                    long       remaining;

                    try
                    {
                        using (var reader = new Utf8StreamReader(textStream, true))
                        {
                            if (fieldSetters == null)
                            {
                                // Header line: only parsed once, even when the data spans several gzip members.
                                var fields = reader.ReadLine().Split((byte)' ');
                                InitializeSetters();
                                fieldSetters = new List <Action <WarcCdxItemRaw, Utf8String> >();

                                var foundCdx = false;
                                foreach (var label in fields)
                                {
                                    if (!foundCdx)
                                    {
                                        if (label == CDX)
                                        {
                                            foundCdx = true;
                                        }
                                    }
                                    else
                                    {
                                        // Unknown labels yield a null setter so that column indexes stay aligned.
                                        allSetters.TryGetValue(label, out var setter);
                                        fieldSetters.Add(setter);
                                    }
                                }
                            }

                            Utf8String[] arr = null;
                            while (!reader.IsCompleted)
                            {
                                var line = reader.ReadLine();
                                if (line.Length == 0)
                                {
                                    continue;
                                }

                                line.Split((byte)' ', StringSplitOptions.None, ref arr);

                                var item = new WarcCdxItemRaw();
                                // NOTE(review): a line with fewer fields than the header may make arr[i]
                                // go out of range, depending on how Split sizes arr - confirm.
                                for (int i = 0; i < fieldSetters.Count; i++)
                                {
                                    var value = arr[i];
                                    if (value.Length > 0 && !(value.Length == 1 && value[0] == (byte)'-'))
                                    {
                                        fieldSetters[i]?.Invoke(item, value);
                                    }
                                }
                                yield return(item);
                            }
                        }

                        if (gzipStream == null)
                        {
                            // Uncompressed input is consumed in a single pass.
                            break;
                        }
                        remaining = GetRemainingUnusedBytes(gzipStream);
                    }
                    finally
                    {
                        // Also runs when the caller stops enumerating early or an exception escapes,
                        // so the decompressor is never left undisposed.
                        gzipStream?.Dispose();
                    }

                    if (remaining == 0)
                    {
                        break;
                    }
                    // Rewind so the next decompressor starts at the beginning of the following member.
                    cdxStream.Position -= remaining;
                }
            }
        }
Exemple #2
0
        /// <summary>
        /// Builds a CDX index file at <paramref name="cdx"/> describing the response records found in
        /// the given WARC files.
        /// </summary>
        /// <remarks>
        /// The index is first written to <c>cdx + ".tmp"</c>; once every WARC has been processed the
        /// existing <paramref name="cdx"/> file is deleted and the temporary file is moved in its place.
        /// Each WARC is read as a sequence of independently gzip-compressed records: a new
        /// <c>GZipStream</c> is opened per record and the file position is corrected afterwards using
        /// <c>GetRemainingUnusedBytes</c>. Only records whose header contains the <c>Warc_Response</c>
        /// line produce an index line; other records are validated and skipped.
        /// The name of every processed WARC file is printed to the console.
        /// If an exception occurs while the WARC file position is at the end of the file, the file is
        /// reported as truncated and processing continues with the next WARC; any other exception is rethrown.
        /// </remarks>
        /// <param name="cdx">Path of the CDX file to create or replace.</param>
        /// <param name="warcs">Paths of the WARC files to index.</param>
        public static void GenerateCdx(string cdx, IEnumerable <string> warcs)
        {
            var scratchpad = new Scratchpad();
            // Buffer used only to drain HTTP bodies while counting their length.
            var buf        = new byte[16 * 1024];

            using (var output = File.Open(cdx + ".tmp", FileMode.Create, FileAccess.Write, FileShare.Delete | FileShare.Read))
            {
                using (var writer = new Utf8StreamWriter(output))
                {
                    // Header line listing the columns.
                    writer.WriteClrStringLine(WarcColumns);
                    foreach (var warc in warcs)
                    {
                        Console.WriteLine(Path.GetFileName(warc));

                        using (var warcStream = File.Open(warc, FileMode.Open, FileAccess.Read, FileShare.Delete | FileShare.Read))
                        {
                            try
                            {
                                var warcname = new Utf8String(Path.GetFileName(warc));
                                // One iteration per WARC record.
                                while (warcStream.Position != warcStream.Length)
                                {
                                    // Per-record state; -1 / Empty / null mean "not present".
                                    var  startPosition     = warcStream.Position;
                                    long warcContentLength = -1;
                                    long payloadLength     = -1;
                                    int  responseCode      = -1;
                                    var  contentType       = Utf8String.Empty;
                                    // 14-byte date buffer; a leading 0 byte marks "no date seen".
                                    var  date = scratchpad.Use(14);
                                    date[0] = 0;
                                    Utf8String url          = Utf8String.Empty;
                                    Utf8String shamanUrl    = Utf8String.Empty;
                                    DateTime?  lastModified = null;
                                    bool       isresponse   = false;
                                    // leaveOpen (third argument) keeps warcStream usable for the next record.
                                    using (var gz = new GZipStream(warcStream, CompressionMode.Decompress, true))
                                    {
                                        using (var reader = new Utf8StreamReader(gz, true))
                                        {
                                            // Read the WARC header lines up to the first empty line.
                                            while (true)
                                            {
                                                if (reader.IsCompleted)
                                                {
                                                    throw new EndOfStreamException();
                                                }
                                                var line = reader.ReadLine();
                                                if (line.Length == 0)
                                                {
                                                    break;
                                                }
                                                if (line.Equals(Warc_Response))
                                                {
                                                    isresponse = true;
                                                }
                                                if (line.StartsWith(Warc_ContentLength))
                                                {
                                                    warcContentLength = Utf8Utils.ParseInt64(WarcItem.GetHeaderValue(line));
                                                }
                                                else if (line.StartsWith(Warc_Date))
                                                {
                                                    // Copies the digit groups at offsets 0-3, 5-6, 8-9, 11-12, 14-15 and 17-18
                                                    // into a compact 14-byte value.
                                                    // NOTE(review): presumably yyyy-MM-ddTHH:mm:ss -> yyyyMMddHHmmss; the value
                                                    // is assumed to be at least 19 bytes long - confirm.
                                                    var val = WarcItem.GetHeaderValue(line).Bytes;
                                                    val.Slice(0, 4).CopyTo(date.Slice(0));
                                                    val.Slice(5, 2).CopyTo(date.Slice(4));
                                                    val.Slice(8, 2).CopyTo(date.Slice(6));
                                                    val.Slice(11, 2).CopyTo(date.Slice(8));
                                                    val.Slice(14, 2).CopyTo(date.Slice(10));
                                                    val.Slice(17, 2).CopyTo(date.Slice(12));
                                                }
                                                else if (line.StartsWith(Warc_URL))
                                                {
                                                    // Copied into the scratchpad so the value can be used after the next ReadLine.
                                                    url = scratchpad.Copy(WarcItem.GetHeaderValue(line));
                                                }
                                                if (line.StartsWith(Warc_Shaman_URI))
                                                {
                                                    shamanUrl = scratchpad.Copy(WarcItem.GetHeaderValue(line));
                                                }
                                            }
                                            // A record without a content length header cannot be processed.
                                            if (warcContentLength == -1)
                                            {
                                                throw new InvalidOperationException();
                                            }


                                            if (isresponse)
                                            {
                                                // Parse the HTTP headers, then drain the body only to measure its length.
                                                using (var s = WarcItem.OpenHttp(reader, scratchpad, new Uri(url.ToString()), warcContentLength, out var payloadLengthFromHeader, out var redirectLocation, out responseCode, out contentType, out lastModified, null))
                                                {
                                                    long l = 0;
                                                    while (true)
                                                    {
                                                        var m = s.Read(buf, 0, buf.Length);
                                                        if (m == 0)
                                                        {
                                                            break;
                                                        }
                                                        l += m;
                                                    }
                                                    payloadLength = l;
                                                    // -1 means OpenHttp reported no usable Content-Length to compare against.
                                                    if (payloadLengthFromHeader != -1 && payloadLengthFromHeader != payloadLength)
                                                    {
                                                        throw new Exception("Content-Length mismatch.");
                                                    }
                                                }
                                                //var httpData = new LimitedStream(reader, contentLength);
                                                // The record body must be followed by exactly CR LF CR LF.
                                                var cr = reader.ReadByte();
                                                if (cr != 13)
                                                {
                                                    throw new InvalidDataException();
                                                }
                                                var lf = reader.ReadByte();
                                                if (lf != 10)
                                                {
                                                    throw new InvalidDataException();
                                                }


                                                cr = reader.ReadByte();
                                                if (cr != 13)
                                                {
                                                    throw new InvalidDataException();
                                                }
                                                lf = reader.ReadByte();
                                                if (lf != 10)
                                                {
                                                    throw new InvalidDataException();
                                                }
                                                //if (reader.ReadByte() != 13) throw new Exception();
                                                //if (reader.ReadByte() != 10) throw new Exception();
                                            }
                                            else
                                            {
                                                // Non-response record: skip warcContentLength bytes of body without inspecting them.
                                                var remaining = warcContentLength;
                                                while (remaining != 0)
                                                {
                                                    var m = reader.Read((int)Math.Min(remaining, int.MaxValue));
                                                    if (m.Count == 0)
                                                    {
                                                        throw new Exception();
                                                    }
                                                    remaining -= m.Count;
                                                }

                                                // Two empty lines must follow the body, and a further read must reach the end of the reader.
                                                var e = reader.ReadLine();
                                                if (e.Length != 0)
                                                {
                                                    throw new InvalidDataException();
                                                }
                                                e = reader.ReadLine();
                                                if (e.Length != 0)
                                                {
                                                    throw new InvalidDataException();
                                                }
                                                e = reader.ReadLine();
                                                if (!reader.IsCompleted)
                                                {
                                                    throw new InvalidDataException();
                                                }
                                            }

                                            //var r = reader.RemainingBufferedData;
                                            // Nothing may remain in this gzip member: ReadByte is expected to return -1.
                                            var end = reader.ReadByte();
                                            if (end != -1)
                                            {
                                                throw new InvalidDataException();
                                            }
                                            //Console.WriteLine($"Remaining: {r.Length}");
                                        }



                                        // Move the file position back by the value GetRemainingUnusedBytes reports so that it
                                        // points at the start of the next record.
                                        // NOTE(review): presumably input bytes the decompressor read but did not consume - confirm.
                                        warcStream.Position -= GetRemainingUnusedBytes(gz);
                                    }



                                    if (isresponse)
                                    {
                                        // Index line, space-separated: URL, offset, compressed length, date, WARC file name,
                                        // response code, content type, payload length, last-modified. "-" marks a missing
                                        // date, response code or last-modified value.
                                        if (shamanUrl.Length > 0)
                                        {
                                            // The Warc_Shaman_URI value takes precedence over the Warc_URL value.
                                            writer.Write(shamanUrl);
                                        }
                                        else
                                        {
                                            writer.Write(url);
                                        }
                                        writer.Write((byte)' ');
                                        writer.Write(startPosition);
                                        writer.Write((byte)' ');
                                        // Size of the record within the WARC file (compressed bytes).
                                        writer.Write(warcStream.Position - startPosition);
                                        writer.Write((byte)' ');
                                        if (date[0] != 0)
                                        {
                                            writer.Write(date);
                                        }
                                        else
                                        {
                                            writer.Write((byte)'-');
                                        }
                                        writer.Write((byte)' ');
                                        writer.Write(warcname);
                                        writer.Write((byte)' ');
                                        if (responseCode != -1)
                                        {
                                            writer.Write(responseCode);
                                        }
                                        else
                                        {
                                            writer.Write((byte)'-');
                                        }
                                        writer.Write((byte)' ');
                                        // May be empty, in which case two consecutive spaces are written.
                                        writer.Write(contentType);
                                        writer.Write((byte)' ');
                                        writer.Write(payloadLength);
                                        writer.Write((byte)' ');
                                        if (lastModified != null)
                                        {
                                            WriteDate(writer, lastModified.Value);
                                        }
                                        else
                                        {
                                            writer.Write((byte)'-');
                                        }
                                        // Every line ends with a trailing space before the line terminator.
                                        writer.Write((byte)' ');

                                        writer.WriteLine();
                                    }
                                    // Release the per-record scratch values (date, url, shamanUrl, contentType).
                                    scratchpad.Reset();
                                }
                            }
                            catch
                            {
                                // Any failure that leaves the file position at the end of the file is treated as a
                                // truncated WARC: warn and continue with the next file. Everything else is rethrown.
                                if (warcStream.Position == warcStream.Length)
                                {
                                    Console.WriteLine("WARNING: truncated WARC.");;
                                }
                                else
                                {
                                    throw;
                                }
                            }
                        }
                    }
                }
            }
            // Replace the previous index with the completed temporary file.
            File.Delete(cdx);
            File.Move(cdx + ".tmp", cdx);
        }
Exemple #3
0
        /// <summary>
        /// Parses the status line and headers of an HTTP response read from
        /// <paramref name="httpReader"/> and returns a stream over the response body, unwrapping
        /// chunked transfer encoding and gzip/brotli content encoding when the headers declare them.
        /// </summary>
        /// <param name="httpReader">Reader positioned at the HTTP status line.</param>
        /// <param name="scratchpad">Storage used to copy the content type value; when null, the
        /// content type is not captured.</param>
        /// <param name="requestedUrl">URL of the request, used as the base for resolving the
        /// <c>Location</c> header.</param>
        /// <param name="responseLength">Total length of the HTTP response (status line, headers and
        /// body), or -1 when unknown. When known, the returned body is limited to the bytes that
        /// remain after the headers.</param>
        /// <param name="payloadLength">Receives the parsed content length header, or -1 when the
        /// header is absent or the response is chunked or compressed.</param>
        /// <param name="location">Receives the resolved redirect target, or null when there is no
        /// location header or it cannot be parsed.</param>
        /// <param name="responseCode">Receives the numeric status code taken from the status line.</param>
        /// <param name="contentType">Receives the content type, cut at the first space or ';' and
        /// copied into <paramref name="scratchpad"/>; <c>Utf8String.Empty</c> when not captured.</param>
        /// <param name="lastModified">Receives the parsed last-modified date, or null when the header
        /// is absent or cannot be parsed.</param>
        /// <param name="onHttpHeader">Optional callback invoked with the trimmed name and value of
        /// every header line that contains a ':'.</param>
        /// <returns>A stream over the (decoded) response body.</returns>
        /// <exception cref="InvalidDataException">The reader completed before the empty line that
        /// ends the headers.</exception>
        /// <exception cref="Exception">The content length header disagrees with the body length
        /// derived from <paramref name="responseLength"/>.</exception>
        public static Stream OpenHttp(Utf8StreamReader httpReader, Scratchpad scratchpad, Uri requestedUrl, long responseLength, out long payloadLength, out Uri location, out int responseCode, out Utf8String contentType, out DateTime?lastModified, Action <Utf8String, Utf8String> onHttpHeader)
        {
            var startPosition = httpReader.Position;

            payloadLength = -1;
            location      = null;
            lastModified  = null;
            // Ensures the out parameter is assigned even when no content type header is captured.
            contentType   = Utf8String.Empty;

            bool chunked      = false;
            bool gzipped      = false;
            bool brotli       = false;
            var  responseLine = httpReader.ReadLine();

            // The status code is the token between the first two spaces, or everything after the
            // first space when the status line has no reason phrase.
            responseCode = (int)Utf8Utils.ParseInt64(responseLine.TryCaptureBetween((byte)' ', (byte)' ') ?? responseLine.CaptureAfter((byte)' '));
            while (true)
            {
                var line = httpReader.ReadLine();
                if (httpReader.IsCompleted)
                {
                    throw new InvalidDataException();
                }
                if (line.Length == 0)
                {
                    // Empty line: end of headers.
                    break;
                }
                if (onHttpHeader != null)
                {
                    var d = line.IndexOf((byte)':');
                    // A malformed line without ':' has no name/value pair to report.
                    if (d >= 0)
                    {
                        onHttpHeader(line.Substring(0, d).Trim(), line.Substring(d + 1).Trim());
                    }
                }
                if (line.StartsWith(Http_TransferEncoding))
                {
                    var value = GetHeaderValue(line);
                    if (value.Equals("chunked"))
                    {
                        chunked = true;
                    }
                }
                else if (line.StartsWith(Http_ContentLength))
                {
                    payloadLength = Utf8Utils.ParseInt64(GetHeaderValue(line));
                }
                else if (line.StartsWith(Http_ContentEncoding))
                {
                    var value = GetHeaderValue(line);
                    if (value == Http_Gzip)
                    {
                        gzipped = true;
                    }
                    else if (value == Http_Brotli)
                    {
                        brotli = true;
                    }
                }
                else if (line.StartsWith(Http_Location))
                {
                    var val = GetHeaderValue(line).ToString();
                    try
                    {
                        if (val.StartsWith("//"))
                        {
                            // Scheme-relative redirect: reuse the scheme of the requested URL.
                            location = new Uri(requestedUrl.Scheme + ":" + val);
                        }
                        else
                        {
                            location = new Uri(requestedUrl, val);
                        }
                    }
                    catch (Exception)
                    {
                        // Best effort: an unparseable redirect target leaves location as null.
                    }
                }
                else if (line.StartsWith(Http_ContentType) && scratchpad != null)
                {
                    var value = GetHeaderValue(line);
                    // Keep only the media type: drop anything after the first space or ';'.
                    value       = value.TryCaptureBefore((byte)' ') ?? value;
                    value       = value.TryCaptureBefore((byte)';') ?? value;
                    contentType = scratchpad.Copy(value);
                }
                else if (line.StartsWith(Http_LastModified))
                {
                    try
                    {
                        lastModified = WarcCdxItemRaw.ParseHttpDate(GetHeaderValue(line));
                    }
                    catch
                    {
                        // Best effort: an unparseable date leaves lastModified as null.
                    }
                }
            }

            var compressed = gzipped || brotli;

            if (compressed || chunked)
            {
                // The declared length does not describe the decoded payload in these cases.
                payloadLength = -1;
            }
            Stream s;

            if (responseLength != -1)
            {
                // Bytes consumed by the status line and headers so far.
                var currentPos     = httpReader.Position - startPosition;
                var httpBodyLength = responseLength - currentPos;

                if (!compressed && !chunked && payloadLength != -1 && httpBodyLength != payloadLength)
                {
                    throw new Exception("Unexpected Content-Length.");
                }
                s = new LimitedStream(httpReader, httpBodyLength);
            }
            else
            {
                s = httpReader;
            }
            if (chunked)
            {
                s = new ChunkedStream(s);
            }
            if (compressed && chunked)
            {
                // NOTE(review): presumably drains the rest of the chunked stream when the
                // decompressor is disposed - confirm against OnDisposeConsumeStream.
                s = new OnDisposeConsumeStream(s);
            }

            if (gzipped)
            {
                s = new GZipStream(s, CompressionMode.Decompress);
            }
            else if (brotli)
            {
                s = new BrotliStream(s, CompressionMode.Decompress);
            }

            return(s);
        }