Пример #1
0
        /// <summary>
        /// Reads <c>target</c> from position 0 to its end, decompressing it in a loop and
        /// handing each decompressed stream to <c>IIndexer.HandleEntry</c>. The requests and
        /// responses collected along the way are stored in <c>WarcRequests</c> and
        /// <c>WarcResponses</c> once the loop finishes.
        /// </summary>
        public override void Scan()
        {
            target.Position = 0;
            StreamBrake         brake     = new StreamBrake(target); //GZIP Streams need to be wrapped, otherwise they won't play nicely...
            long                counter   = 0;
            List <WarcRequest>  requests  = new List <WarcRequest>();
            List <WarcResponse> responses = new List <WarcResponse>();

            Debug.Write("Parsing WARC..");
            while (target.Position != target.Length)
            {
                // Progress indicator: one dot for every 2000 iterations.
                if (counter++ % 2000 == 0)
                {
                    Debug.Write(".");
                }

                // Remember where this entry starts before the decompressor consumes any input.
                GzipLocation location = new GzipLocation(target, brake.Position);

                // leaveOpen = true: disposing the decompressor releases its inflater but must not
                // close the brake, which is shared by every iteration of this loop.
                using (GZipStream gzStream = new GZipStream(brake, CompressionMode.Decompress, true))
                {
                    // The reader is deliberately not disposed: it owns nothing but managed buffers,
                    // and disposing it would close gzStream (and through it nothing else, but the
                    // using block above already takes care of that).
                    StreamReader str = new StreamReader(gzStream, Encoding.ASCII);
                    IIndexer.HandleEntry(str, location, requests, responses);

                    // Drain whatever HandleEntry left unread until the decompressed stream ends.
                    // NOTE(review): presumably this is what advances target to the next gzip
                    // member - confirm against StreamBrake.
                    while (str.ReadLine() != null)
                    {
                    }
                }
            }

            WarcRequests  = requests;
            WarcResponses = responses;
        }
Пример #2
0
        /// <summary>
        /// Reads the first two lines of a record from <paramref name="str"/>, determines the
        /// record type from the second line, and constructs the matching record object from
        /// the remaining reader content.
        /// </summary>
        /// <param name="str">Reader positioned at the first line of a record.</param>
        /// <param name="location">Location assigned to the <c>GzipLocation</c> property of warcinfo, request and response records.</param>
        /// <param name="requests">List that receives every "request" record.</param>
        /// <param name="responses">List that receives every "response" record.</param>
        /// <exception cref="InvalidDataException">
        /// The reader is exhausted, the first line does not start with "WARC", or the second
        /// line has no space-separated second token.
        /// </exception>
        /// <exception cref="NotImplementedException">The record type is not one of the handled types.</exception>
        protected static void HandleEntry(StreamReader str, GzipLocation location, List <WarcRequest> requests, List <WarcResponse> responses)
        {
            // ReadLine returns null at end of stream; treat that like any other non-WARC input
            // instead of failing with a NullReferenceException.
            string warcVersion = str.ReadLine();
            if (warcVersion == null || !warcVersion.StartsWith("WARC", StringComparison.Ordinal))
            {
                throw new InvalidDataException("doesn't look like a WARC file to me.");
            }

            // The record type is the second space-separated token of the second line.
            string   typeLine  = str.ReadLine();
            string[] typeParts = typeLine == null ? null : typeLine.Split(' ');
            if (typeParts == null || typeParts.Length < 2)
            {
                throw new InvalidDataException("WARC record is missing its type line.");
            }
            string type = typeParts[1];

            switch (type)
            {
            case "warcinfo":
                // NOTE(review): a second warcinfo record (a "megawarc") is not detected here,
                // because this static method keeps no state between calls.
                WarcInfo info = new WarcInfo(str);
                info.GzipLocation = location;
                break;

            case "request":
                WarcRequest requ = new WarcRequest(str);
                requ.GzipLocation = location;
                requests.Add(requ);
                break;

            case "response":
                WarcResponse resp = new WarcResponse(str);
                resp.GzipLocation = location;
                responses.Add(resp);
                break;

            case "metadata":
                new WarcMetadata(str);     //I don't care about metadata, so I'll just discard these.
                break;

            case "resource":
                new WarcResource(str);     //The same applies to resources.
                break;

            default:
                throw new NotImplementedException(string.Format("Don't know what {0} means.", type));
            }
        }
Пример #3
0
        /// <summary>
        /// Reads <c>parentStream</c> from position 0 to its end. For every gzip member it parses
        /// the gzip header by hand into this instance's fields, inflates the payload and passes
        /// it to <c>IIndexer.HandleEntry</c>, then reads the trailing CRC-32 and uncompressed
        /// size. The collected records end up in <c>WarcResponses</c> and <c>WarcRequests</c>.
        /// </summary>
        /// <exception cref="InvalidDataException">
        /// A member does not start with the gzip magic number, or its extra field is not
        /// exactly 12 bytes long.
        /// </exception>
        public override void Scan()
        {
            List <WarcRequest>  requests  = new List <WarcRequest>();
            List <WarcResponse> responses = new List <WarcResponse>();

            parentStream.Position = 0;

            // One reader serves the whole scan. It is intentionally never disposed, because
            // disposing a BinaryReader closes the stream it wraps.
            BinaryReader br = new BinaryReader(parentStream);

            while (parentStream.Position != parentStream.Length)
            {
                // Start offset of this member; used below to locate its trailer.
                GzipLocation location = new GzipLocation(parentStream, parentStream.Position);

                if (br.ReadUInt16() != GzipUtils.GzipMagicNumber)
                {
                    throw new InvalidDataException("GZIP Header not found.");
                }
                method = (CompressionMethod)br.ReadByte();

                // Flag byte: one bit per optional header section.
                byte mask = br.ReadByte();
                isText                    = (mask & 0x01) != 0;
                hasChecksum               = (mask & 0x02) != 0;
                hasExtraFields            = (mask & 0x04) != 0;
                hasOriginalFileNameString = (mask & 0x08) != 0;
                containsComment           = (mask & 0x10) != 0;
                lastModificationTime      = br.ReadUInt32();
                extraFlags                = br.ReadByte();
                operatingSystem           = (GzipOperatingSystem)br.ReadByte();

                // Optional sections follow in this fixed order: extra field, file name,
                // comment, header checksum.
                if (hasExtraFields)
                {
                    extraSize       = br.ReadUInt16();
                    extraDataOffset = br.BaseStream.Position;
                    if (extraSize != 12)
                    {
                        throw new InvalidDataException("That doesn't look like a WARC file, because the Gzip Extra Data Length seems uncommon.");
                    }
                    extraData = new WARC_EXTRA_DATA(br.ReadBytes(extraSize));
                }
                if (hasOriginalFileNameString)
                {
                    originalFilename = readNullTerminatedString(br);
                }
                if (containsComment)
                {
                    comment = readNullTerminatedString(br);
                }
                if (hasChecksum)
                {
                    crc = br.ReadUInt16();
                }

                // leaveOpen = true: disposing the inflater frees its resources without closing
                // parentStream, which the next iteration still needs.
                using (DeflateStream ds = new DeflateStream(parentStream, CompressionMode.Decompress, true))
                {
                    StreamReader sr = new StreamReader(ds, Encoding.ASCII);
                    IIndexer.HandleEntry(sr, location, requests, responses);
                }

                // The stream position after inflating is not relied upon; jump explicitly to the
                // last 8 bytes of the member and read the two trailer values.
                // NOTE(review): CompressedSize is defined outside this method - presumably it is
                // derived from extraData, in which case a member without an extra field would
                // reuse a stale value here. Confirm.
                br.BaseStream.Position = location.offset + (CompressedSize - 8);
                crc32            = br.ReadUInt32();
                uncompressedSize = br.ReadUInt32();
            }

            WarcResponses = responses;
            WarcRequests  = requests;
        }