Exemplo n.º 1
0
 /// <summary>
 /// Finishes the record that is currently open: writes the two trailing line
 /// terminators, disposes the record writer and clears <c>currentRecord</c> so a
 /// new record can be started.
 /// </summary>
 /// <exception cref="InvalidOperationException">No record is currently open.</exception>
 public void EndRecord()
 {
     // Mirror the guard in StartRecord instead of failing with a NullReferenceException.
     if (currentRecord == null)
     {
         throw new InvalidOperationException();
     }

     try
     {
         currentRecord.WriteLine();
         currentRecord.WriteLine();
     }
     finally
     {
         // Always release the writer and reset the state, even when a trailing
         // write fails; otherwise every later StartRecord call would throw.
         var finished = currentRecord;
         currentRecord = null;
         finished.Dispose();
     }
 }
Exemplo n.º 2
0
 /// <summary>
 /// Writes <paramref name="num"/> to <paramref name="writer"/>, preceded by a
 /// single '0' byte when the value is below 10.
 /// </summary>
 /// <param name="writer">Destination writer.</param>
 /// <param name="num">Value to write; callers in this file pass date/time components.</param>
 private static void WriteTwoDigitValue(Utf8StreamWriter writer, int num)
 {
     var needsLeadingZero = num < 10;
     if (needsLeadingZero)
     {
         writer.Write((byte)'0');
     }

     writer.Write(num);
 }
Exemplo n.º 3
0
 /// <summary>
 /// Writes <paramref name="date"/> as a fixed-width 14-digit timestamp
 /// (year, month, day, hour, minute, second) with no separators.
 /// </summary>
 /// <param name="writer">Destination writer.</param>
 /// <param name="date">Date to write; its components are used as-is, no time-zone conversion is applied.</param>
 private static void WriteDate(Utf8StreamWriter writer, DateTime date)
 {
     // DateTime.Year ranges from 1 to 9999. Emitting it as two zero-padded pairs
     // keeps the year at four digits even for values below 1000 (for example
     // DateTime.MinValue), so the timestamp column never changes width.
     WriteTwoDigitValue(writer, date.Year / 100);
     WriteTwoDigitValue(writer, date.Year % 100);
     WriteTwoDigitValue(writer, date.Month);
     WriteTwoDigitValue(writer, date.Day);
     WriteTwoDigitValue(writer, date.Hour);
     WriteTwoDigitValue(writer, date.Minute);
     WriteTwoDigitValue(writer, date.Second);
 }
Exemplo n.º 4
0
 /// <summary>
 /// Opens a new record by creating the writer stored in <c>currentRecord</c>.
 /// When <c>UseGzip</c> is set, the record is written through its own
 /// <see cref="GZipStream"/> layered over <c>outstream</c>.
 /// </summary>
 /// <exception cref="InvalidOperationException">A record is already open.</exception>
 public void StartRecord()
 {
     if (currentRecord != null)
     {
         throw new InvalidOperationException();
     }

     if (!UseGzip)
     {
         currentRecord = new Utf8StreamWriter(outstream, true);
         return;
     }

     // leaveOpen: true keeps outstream usable after this record's gzip stream is closed.
     // NOTE(review): the boolean passed to Utf8StreamWriter is presumably its own
     // leave-open flag (false here so the gzip stream is closed with the writer) - confirm.
     var compressed = new GZipStream(outstream, CompressionMode.Compress, true);
     currentRecord = new Utf8StreamWriter(compressed, false);
 }
Exemplo n.º 5
0
        /// <summary>
        /// Writes a complete "warcinfo" record: the field block is first rendered into
        /// the reusable <c>ms</c> buffer so its length is known, then the record headers
        /// and the buffered block are written to the current record.
        /// A fresh GUID is generated and stored in <c>WarcInfoId</c>.
        /// </summary>
        public void WriteWarcInfo()
        {
            StartRecord();

            // Render the record body into the scratch stream first; Content-Length
            // below is taken from what ends up in it.
            ms.SetLength(0);
            using (var fields = new Utf8StreamWriter(ms, true))
            {
                fields.WriteClrStringLine("Software: Shaman.IO/1.1");
                fields.WriteClrStringLine("Format: WARC File Format 1.0");
                fields.WriteClrStringLine("Conformsto: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf");
                fields.WriteClrStringLine("Robots: off");
                fields.WriteLine();
            }

            // Record headers.
            currentRecord.WriteClrStringLine("WARC/1.0");
            currentRecord.WriteClrStringLine("WARC-Type: warcinfo");
            currentRecord.WriteClrStringLine("Content-Type: application/warc-fields");

            currentRecord.WriteClrString("WARC-Date: ");
            // The sortable ("s") specifier yields yyyy-MM-ddTHH:mm:ss, i.e. the
            // round-trip format cut off before the fractional seconds.
            var utcTimestamp = DateTime.UtcNow.ToString("s");
            currentRecord.WriteClrString(utcTimestamp);
            currentRecord.WriteClrStringLine("Z");

            WarcInfoId = Guid.NewGuid().ToString();

            currentRecord.WriteClrString("WARC-Record-ID: <urn:uuid:");
            currentRecord.WriteClrString(WarcInfoId);
            currentRecord.WriteClrStringLine(">");

            currentRecord.WriteClrString("Content-Length: ");
            currentRecord.Write(ms.Length);
            currentRecord.WriteLine();

            currentRecord.WriteClrString("WARC-Warcinfo-ID: <urn:uuid:");
            currentRecord.WriteClrString(WarcInfoId);
            currentRecord.WriteClrStringLine(">");

            // Blank line separating the headers from the body.
            currentRecord.WriteLine();

            // Copy the buffered field block straight out of the scratch stream's backing array.
            ms.TryGetBuffer(out var backing);
            var body = backing.Array.Slice(backing.Offset, (int)ms.Length);
            currentRecord.Write(body);

            EndRecord();
        }
Exemplo n.º 6
0
        /// <summary>
        /// Serializes <paramref name="t"/> to the file <paramref name="outFilename"/> as XML,
        /// creating the target directory path first.
        /// </summary>
        /// <typeparam name="T">Static type of the object; the runtime type is what gets serialized.</typeparam>
        /// <param name="t">Object to serialize. Must not be null.</param>
        /// <param name="outFilename">Path of the file to write.</param>
        /// <param name="inOmitXmlDeclaration">When true, the XML declaration is left out.</param>
        /// <param name="inNameSpaces">Namespaces to emit; when null, a single empty namespace mapping is used.</param>
        /// <param name="inEncoding">
        /// Only consulted to decide the writer type: <see cref="Encoding.UTF8"/> selects
        /// <c>Utf8StreamWriter</c>, anything else (including null) uses a plain <see cref="StreamWriter"/>.
        /// </param>
        /// <exception cref="ArgumentNullException"><paramref name="t"/> is null.</exception>
        public static void SerializeToXml <T>(T t, string outFilename, bool inOmitXmlDeclaration = false, XmlSerializerNamespaces inNameSpaces = null, Encoding inEncoding = null)
        {
            if (t == null)
            {
                throw new ArgumentNullException(nameof(t));
            }

            MakeDirectoryPath(outFilename);
            var ns = inNameSpaces;

            if (ns == null)
            {
                ns = new XmlSerializerNamespaces();
                ns.Add("", "");
            }
            var serializer = new XmlSerializer(t.GetType());

            // Open exactly one writer on the file. Creating a StreamWriter first and then
            // replacing it would leave the first handle open on the same path.
            TextWriter textWriter;
            if (inEncoding != null && inEncoding.Equals(Encoding.UTF8))
            {
                textWriter = new Utf8StreamWriter(outFilename);
            }
            else
            {
                textWriter = new StreamWriter(outFilename);
            }

            var settings = new XmlWriterSettings {
                OmitXmlDeclaration = inOmitXmlDeclaration
            };

            // Dispose order: the XmlWriter flushes into textWriter first (CloseOutput is
            // false by default), then textWriter is closed - also when Serialize throws.
            using (textWriter)
            using (var xmlWriter = XmlWriter.Create(textWriter, settings))
            {
                serializer.Serialize(xmlWriter, t, ns);
            }
        }
Exemplo n.º 7
0
        /// <summary>
        /// Builds an index file for a set of WARC files. Every WARC is read as a sequence
        /// of independently gzip-compressed records; for each record identified as a
        /// response, one line is written containing its URL, its byte offset and length
        /// inside the WARC file, its date, the WARC file name, the HTTP status code,
        /// content type, payload length and last-modified date.
        /// The index is written to <paramref name="cdx"/> + ".tmp" and then moved over
        /// <paramref name="cdx"/>.
        /// </summary>
        /// <param name="cdx">Path of the index file to produce.</param>
        /// <param name="warcs">Paths of the WARC files to index.</param>
        public static void GenerateCdx(string cdx, IEnumerable <string> warcs)
        {
            var scratchpad = new Scratchpad();
            // Reusable buffer for draining HTTP payloads while counting their length.
            var buf        = new byte[16 * 1024];

            using (var output = File.Open(cdx + ".tmp", FileMode.Create, FileAccess.Write, FileShare.Delete | FileShare.Read))
            {
                using (var writer = new Utf8StreamWriter(output))
                {
                    // Header line naming the columns.
                    writer.WriteClrStringLine(WarcColumns);
                    foreach (var warc in warcs)
                    {
                        Console.WriteLine(Path.GetFileName(warc));

                        using (var warcStream = File.Open(warc, FileMode.Open, FileAccess.Read, FileShare.Delete | FileShare.Read))
                        {
                            try
                            {
                                var warcname = new Utf8String(Path.GetFileName(warc));
                                // One iteration per record, until the file has been consumed.
                                while (warcStream.Position != warcStream.Length)
                                {
                                    // Offset of this record within the (compressed) WARC file.
                                    var  startPosition     = warcStream.Position;
                                    long warcContentLength = -1;
                                    long payloadLength     = -1;
                                    int  responseCode      = -1;
                                    var  contentType       = Utf8String.Empty;
                                    // 14 bytes for the record date; a leading 0 byte means "no date seen".
                                    var  date = scratchpad.Use(14);
                                    date[0] = 0;
                                    Utf8String url          = Utf8String.Empty;
                                    Utf8String shamanUrl    = Utf8String.Empty;
                                    DateTime?  lastModified = null;
                                    bool       isresponse   = false;
                                    // A fresh decompressor per record; warcStream is left open for the next one.
                                    using (var gz = new GZipStream(warcStream, CompressionMode.Decompress, true))
                                    {
                                        using (var reader = new Utf8StreamReader(gz, true))
                                        {
                                            // Read the WARC header lines up to the blank line that ends them.
                                            while (true)
                                            {
                                                if (reader.IsCompleted)
                                                {
                                                    throw new EndOfStreamException();
                                                }
                                                var line = reader.ReadLine();
                                                if (line.Length == 0)
                                                {
                                                    break;
                                                }
                                                if (line.Equals(Warc_Response))
                                                {
                                                    isresponse = true;
                                                }
                                                if (line.StartsWith(Warc_ContentLength))
                                                {
                                                    warcContentLength = Utf8Utils.ParseInt64(WarcItem.GetHeaderValue(line));
                                                }
                                                else if (line.StartsWith(Warc_Date))
                                                {
                                                    // Pack the digit groups at offsets 0-3, 5-6, 8-9, 11-12, 14-15 and
                                                    // 17-18 into 14 contiguous bytes, dropping the characters in between.
                                                    // NOTE(review): assumes a value shaped like yyyy-MM-ddTHH:mm:ss - confirm.
                                                    var val = WarcItem.GetHeaderValue(line).Bytes;
                                                    val.Slice(0, 4).CopyTo(date.Slice(0));
                                                    val.Slice(5, 2).CopyTo(date.Slice(4));
                                                    val.Slice(8, 2).CopyTo(date.Slice(6));
                                                    val.Slice(11, 2).CopyTo(date.Slice(8));
                                                    val.Slice(14, 2).CopyTo(date.Slice(10));
                                                    val.Slice(17, 2).CopyTo(date.Slice(12));
                                                }
                                                else if (line.StartsWith(Warc_URL))
                                                {
                                                    url = scratchpad.Copy(WarcItem.GetHeaderValue(line));
                                                }
                                                if (line.StartsWith(Warc_Shaman_URI))
                                                {
                                                    shamanUrl = scratchpad.Copy(WarcItem.GetHeaderValue(line));
                                                }
                                            }
                                            // A record without a content length cannot be skipped over.
                                            if (warcContentLength == -1)
                                            {
                                                throw new InvalidOperationException();
                                            }


                                            if (isresponse)
                                            {
                                                // Parse the HTTP response and drain its body to measure the payload length.
                                                using (var s = WarcItem.OpenHttp(reader, scratchpad, new Uri(url.ToString()), warcContentLength, out var payloadLengthFromHeader, out var redirectLocation, out responseCode, out contentType, out lastModified, null))
                                                {
                                                    long l = 0;
                                                    while (true)
                                                    {
                                                        var m = s.Read(buf, 0, buf.Length);
                                                        if (m == 0)
                                                        {
                                                            break;
                                                        }
                                                        l += m;
                                                    }
                                                    payloadLength = l;
                                                    // -1 means OpenHttp reported no length to compare against.
                                                    if (payloadLengthFromHeader != -1 && payloadLengthFromHeader != payloadLength)
                                                    {
                                                        throw new Exception("Content-Length mismatch.");
                                                    }
                                                }
                                                //var httpData = new LimitedStream(reader, contentLength);
                                                // The record must be followed by exactly CR LF CR LF.
                                                var cr = reader.ReadByte();
                                                if (cr != 13)
                                                {
                                                    throw new InvalidDataException();
                                                }
                                                var lf = reader.ReadByte();
                                                if (lf != 10)
                                                {
                                                    throw new InvalidDataException();
                                                }


                                                cr = reader.ReadByte();
                                                if (cr != 13)
                                                {
                                                    throw new InvalidDataException();
                                                }
                                                lf = reader.ReadByte();
                                                if (lf != 10)
                                                {
                                                    throw new InvalidDataException();
                                                }
                                                //if (reader.ReadByte() != 13) throw new Exception();
                                                //if (reader.ReadByte() != 10) throw new Exception();
                                            }
                                            else
                                            {
                                                // Not a response: skip over the record body without interpreting it.
                                                var remaining = warcContentLength;
                                                while (remaining != 0)
                                                {
                                                    var m = reader.Read((int)Math.Min(remaining, int.MaxValue));
                                                    if (m.Count == 0)
                                                    {
                                                        throw new Exception();
                                                    }
                                                    remaining -= m.Count;
                                                }

                                                // Two empty lines are expected after the body...
                                                var e = reader.ReadLine();
                                                if (e.Length != 0)
                                                {
                                                    throw new InvalidDataException();
                                                }
                                                e = reader.ReadLine();
                                                if (e.Length != 0)
                                                {
                                                    throw new InvalidDataException();
                                                }
                                                // ...and one more read must leave the reader completed.
                                                e = reader.ReadLine();
                                                if (!reader.IsCompleted)
                                                {
                                                    throw new InvalidDataException();
                                                }
                                            }

                                            //var r = reader.RemainingBufferedData;
                                            // Nothing may remain in this record's decompressed data.
                                            var end = reader.ReadByte();
                                            if (end != -1)
                                            {
                                                throw new InvalidDataException();
                                            }
                                            //Console.WriteLine($"Remaining: {r.Length}");
                                        }



                                        // Rewind the file by the amount reported by GetRemainingUnusedBytes.
                                        // NOTE(review): presumably the bytes the decompressor read from warcStream
                                        // beyond the end of this gzip member, so the next record starts at the
                                        // right offset - confirm against GetRemainingUnusedBytes.
                                        warcStream.Position -= GetRemainingUnusedBytes(gz);
                                    }



                                    // Only response records produce an index line.
                                    if (isresponse)
                                    {
                                        // Column 1: URL, preferring the value of the Warc_Shaman_URI header when present.
                                        if (shamanUrl.Length > 0)
                                        {
                                            writer.Write(shamanUrl);
                                        }
                                        else
                                        {
                                            writer.Write(url);
                                        }
                                        writer.Write((byte)' ');
                                        // Offset of the record in the WARC file, then its length in the file.
                                        writer.Write(startPosition);
                                        writer.Write((byte)' ');
                                        writer.Write(warcStream.Position - startPosition);
                                        writer.Write((byte)' ');
                                        // Record date, or '-' when no date header was seen.
                                        if (date[0] != 0)
                                        {
                                            writer.Write(date);
                                        }
                                        else
                                        {
                                            writer.Write((byte)'-');
                                        }
                                        writer.Write((byte)' ');
                                        writer.Write(warcname);
                                        writer.Write((byte)' ');
                                        // HTTP status code, or '-' when it is still -1.
                                        if (responseCode != -1)
                                        {
                                            writer.Write(responseCode);
                                        }
                                        else
                                        {
                                            writer.Write((byte)'-');
                                        }
                                        writer.Write((byte)' ');
                                        writer.Write(contentType);
                                        writer.Write((byte)' ');
                                        writer.Write(payloadLength);
                                        writer.Write((byte)' ');
                                        // Last-modified date, or '-' when unknown.
                                        if (lastModified != null)
                                        {
                                            WriteDate(writer, lastModified.Value);
                                        }
                                        else
                                        {
                                            writer.Write((byte)'-');
                                        }
                                        writer.Write((byte)' ');

                                        writer.WriteLine();
                                    }
                                    // Release the per-record scratch allocations (date, url, shamanUrl).
                                    scratchpad.Reset();
                                }
                            }
                            catch
                            {
                                // Any failure that leaves the file positioned exactly at its end is
                                // reported as a truncated WARC and processing moves on to the next file;
                                // every other failure is rethrown.
                                if (warcStream.Position == warcStream.Length)
                                {
                                    Console.WriteLine("WARNING: truncated WARC.");;
                                }
                                else
                                {
                                    throw;
                                }
                            }
                        }
                    }
                }
            }
            // Replace any previous index with the newly written one.
            File.Delete(cdx);
            File.Move(cdx + ".tmp", cdx);
        }