示例#1
0
        /// <summary>
        /// Rewinds <c>target</c> and walks it to its end, handing one record at a time to
        /// <c>IIndexer.HandleEntry</c>. The collected requests and responses are published
        /// through <c>WarcRequests</c> and <c>WarcResponses</c> once the whole stream has been read.
        /// </summary>
        /// <remarks>
        /// Each loop iteration opens a fresh <see cref="GZipStream"/> on top of the shared
        /// <c>StreamBrake</c>, so the input is presumably a concatenation of individually
        /// gzipped records (verify against the producer of the file).
        /// </remarks>
        public override void Scan()
        {
            target.Position = 0;

            //GZIP Streams need to be wrapped, otherwise they won't play nicely...
            StreamBrake         brake     = new StreamBrake(target);
            long                counter   = 0;
            List <WarcRequest>  requests  = new List <WarcRequest>();
            List <WarcResponse> responses = new List <WarcResponse>();

            Debug.Write("Parsing WARC..");
            while (target.Position != target.Length)
            {
                // Progress indicator: one dot for every 2000 records visited.
                if (counter++ % 2000 == 0)
                {
                    Debug.Write(".");
                }

                // Capture the position before any decompression happens so the location
                // refers to the beginning of this record.
                GzipLocation location = new GzipLocation(target, brake.Position);

                // NOTE(review): the GZipStream/StreamReader pair is intentionally not disposed
                // here, because disposing would also close the wrapped brake unless leaveOpen
                // is used. Confirm StreamBrake semantics before adding using-blocks.
                GZipStream   gzStream = new GZipStream(brake, CompressionMode.Decompress);
                StreamReader str      = new StreamReader(gzStream, Encoding.ASCII);
                IIndexer.HandleEntry(str, location, requests, responses);

                // Read and discard whatever the handler left unread so the reader reaches
                // its end before the next record is opened.
                while (str.ReadLine() != null)
                {
                }
            }

            WarcRequests  = requests;
            WarcResponses = responses;
        }
示例#2
0
        /// <summary>
        /// Reads the first two header lines of a single WARC record from <paramref name="str"/>,
        /// determines the record type from the second line, and lets the matching record class
        /// consume the rest of the reader.
        /// </summary>
        /// <param name="str">Reader positioned at the first line of the record.</param>
        /// <param name="location">Location that is attached to the parsed record.</param>
        /// <param name="requests">Receives the record when its type is <c>request</c>.</param>
        /// <param name="responses">Receives the record when its type is <c>response</c>.</param>
        /// <exception cref="InvalidDataException">
        /// The reader is exhausted, the first line does not start with "WARC", or the second
        /// line has no space-separated value to use as the record type.
        /// </exception>
        /// <exception cref="NotImplementedException">The record type is not one of the handled ones.</exception>
        protected static void HandleEntry(StreamReader str, GzipLocation location, List <WarcRequest> requests, List <WarcResponse> responses)
        {
            // ReadLine returns null at end of stream; treat that like any other non-WARC input
            // instead of failing with a NullReferenceException.
            string warcVersion = str.ReadLine();
            if (warcVersion == null || !warcVersion.StartsWith("WARC", StringComparison.Ordinal))
            {
                throw new InvalidDataException("doesn't look like a WARC file to me.");
            }

            // The second line is expected to look like "<name>: <type>"; the type is the token
            // after the first space.
            string   typeLine  = str.ReadLine();
            string[] typeParts = typeLine == null ? null : typeLine.Split(' ');
            if (typeParts == null || typeParts.Length < 2)
            {
                throw new InvalidDataException("WARC record is missing its type header.");
            }
            string type = typeParts[1];

            switch (type)
            {
            case "warcinfo":
                // The record is parsed so the reader is consumed, but the result is not kept.
                // A former "megawarcs are not yet supported" guard lived here; it compared a
                // local that was always null and could never fire. Detecting a second warcinfo
                // record would need state owned by the caller.
                WarcInfo info = new WarcInfo(str);
                info.GzipLocation = location;
                break;

            case "request":
                WarcRequest requ = new WarcRequest(str);
                requ.GzipLocation = location;
                requests.Add(requ);
                break;

            case "response":
                WarcResponse resp = new WarcResponse(str);
                resp.GzipLocation = location;
                responses.Add(resp);
                break;

            case "metadata":
                new WarcMetadata(str);     //I don't care about metadata, so I'll just discard these.
                break;

            case "resource":
                new WarcResource(str);     //The same applies to resources.
                break;

            default:
                throw new NotImplementedException(string.Format("Don't know what {0} means.", type));
            }
        }