/// <summary>
/// Persists every document of the received corpus: optionally as a gzip-compressed XML file
/// (when <c>mXmlDataRoot</c> is set), optionally as gzip-compressed raw HTML (when
/// <c>mHtmlDataRoot</c> is set), and optionally as a row bulk-inserted into the
/// <c>Documents</c> database table (when <c>mConnectionString</c> is set).
/// </summary>
/// <param name="sender">The producer that emitted the data; not used by this method.</param>
/// <param name="data">The payload; it is cast to <c>DocumentCorpus</c>.</param>
protected override void ConsumeData(IDataProducer sender, object data)
{
    DocumentCorpus c = (DocumentCorpus)data;
    DataTable dt = CreateTable();
    foreach (Document doc in c.Documents)
    {
        // work on a copy so that the features/annotations of the incoming document stay untouched
        Document d = doc.Clone();
        string rawHtml = d.Features.GetFeatureValue("raw");
        DateTime time = DateTime.Parse(d.Features.GetFeatureValue("time"));
        Guid cGuid = new Guid(c.Features.GetFeatureValue("guid"));
        Guid dGuid = new Guid(d.Features.GetFeatureValue("guid"));
        Guid docId = ComputeDocumentId(cGuid, dGuid);
        d.Features.RemoveFeature("raw");
        DateTime timeEnd = DateTime.Parse(c.Features.GetFeatureValue("timeEnd"));
        d.Features.SetFeatureValue("oldId", string.Format("{0:HH}_{0:mm}_{0:ss}_{1:N}_{2:N}", timeEnd, cGuid, dGuid));
        d.Features.SetFeatureValue("guid", docId.ToString("N"));
        d.Features.SetFeatureValue("rssUrl", c.Features.GetFeatureValue("sourceUrl"));
        d.Features.SetFeatureValue("siteId", c.Features.GetFeatureValue("siteId"));
        // remove boilerplate removal features
        foreach (Annotation annot in d.Annotations)
        {
            // annotation types are identifiers, so compare ordinally rather than by current culture
            if (annot.Type.StartsWith("TextBlock", StringComparison.Ordinal))
            {
                annot.Features.Clear();
            }
        }
        // date-based relative location (without extension) shared by the XML file, the HTML file and the database row
        string relativeStem = string.Format("{0:yyyy}\\{0:MM}\\{0:dd}\\{0:HH}_{0:mm}_{0:ss}_{1:N}", time, docId);
        // write doc XML
        if (mXmlDataRoot != null)
        {
            string outFileName = string.Format("{0}\\{1}.xml.gz", mXmlDataRoot, relativeStem);
            EnsureParentDirectoryExists(outFileName);
            d.WriteXmlCompressed(outFileName);
        }
        // write raw HTML
        if (mHtmlDataRoot != null)
        {
            string outFileName = string.Format("{0}\\{1}.html.gz", mHtmlDataRoot, relativeStem);
            EnsureParentDirectoryExists(outFileName);
            WriteCompressedBase64(outFileName, rawHtml);
        }
        // prepare for bulk write
        if (mConnectionString != null)
        {
            string fileName = relativeStem + ".xml.gz";
            dt.Rows.Add(
                docId, // same value that was stored into the "guid" feature above
                Utils.Truncate(d.Name, 400),
                Utils.Truncate(d.Features.GetFeatureValue("description"), 400),
                Utils.Truncate(d.Features.GetFeatureValue("category"), 400),
                Utils.Truncate(d.Features.GetFeatureValue("link"), 400),
                Utils.Truncate(d.Features.GetFeatureValue("responseUrl"), 400),
                Utils.Truncate(d.Features.GetFeatureValue("urlKey"), 400),
                time, // already parsed from the "time" feature above
                Utils.Truncate(d.Features.GetFeatureValue("pubDate"), 100),
                Utils.Truncate(d.Features.GetFeatureValue("mimeType"), 80),
                Utils.Truncate(d.Features.GetFeatureValue("charSet"), 40),
                Convert.ToInt32(d.Features.GetFeatureValue("contentLength")),
                Utils.Truncate(d.Features.GetFeatureValue("domainName"), 100),
                Convert.ToInt32(d.Features.GetFeatureValue("bprBoilerplateCharCount")),
                Convert.ToInt32(d.Features.GetFeatureValue("bprContentCharCount")),
                Convert.ToInt32(d.Features.GetFeatureValue("unseenContentCharCount")),
                Convert.ToInt32(d.Features.GetFeatureValue("rev")),
                Utils.Truncate(fileName, 100),
                cGuid,
                dGuid
            );
        }
    }
    // bulk write to database
    if (mConnectionString != null && dt.Rows.Count > 0)
    {
        using (SqlConnection connection = new SqlConnection(mConnectionString))
        {
            connection.Open();
            using (SqlBulkCopy bulkWriter = new SqlBulkCopy(connection))
            {
                bulkWriter.BulkCopyTimeout = mCommandTimeout;
                bulkWriter.DestinationTableName = "Documents";
                bulkWriter.WriteToServerRetryOnDeadlock(dt);
            }
        }
    }
}

/// <summary>
/// Derives the document identifier as the MD5 hash of the corpus GUID bytes followed by the document GUID bytes.
/// </summary>
/// <param name="corpusGuid">GUID read from the corpus features.</param>
/// <param name="documentGuid">GUID read from the document features.</param>
/// <returns>A GUID built from the 16-byte MD5 digest.</returns>
private static Guid ComputeDocumentId(Guid corpusGuid, Guid documentGuid)
{
    byte[] corpusBytes = corpusGuid.ToByteArray();
    byte[] documentBytes = documentGuid.ToByteArray();
    byte[] buffer = new byte[corpusBytes.Length + documentBytes.Length];
    Array.Copy(corpusBytes, 0, buffer, 0, corpusBytes.Length);
    Array.Copy(documentBytes, 0, buffer, corpusBytes.Length, documentBytes.Length);
    // the hash algorithm is disposable; release it as soon as the digest is computed
    using (MD5 md5 = MD5.Create())
    {
        return new Guid(md5.ComputeHash(buffer));
    }
}

/// <summary>
/// Creates the directory that will contain <paramref name="fileName"/> if it does not exist yet.
/// The existence check is repeated while holding <c>mLock</c> before the directory is created.
/// </summary>
/// <param name="fileName">Path of the file that is about to be written.</param>
private void EnsureParentDirectoryExists(string fileName)
{
    string path = new FileInfo(fileName).DirectoryName;
    if (!Directory.Exists(path))
    {
        lock (mLock)
        {
            if (!Directory.Exists(path))
            {
                Directory.CreateDirectory(path);
            }
        }
    }
}

/// <summary>
/// Decodes <paramref name="base64Content"/> and writes the resulting bytes, gzip-compressed,
/// to <paramref name="fileName"/> (an existing file is overwritten).
/// </summary>
/// <param name="fileName">Destination file path.</param>
/// <param name="base64Content">Base64-encoded payload.</param>
private static void WriteCompressedBase64(string fileName, string base64Content)
{
    using (FileStream stream = new FileStream(fileName, FileMode.Create))
    {
        using (GZipStream gzStream = new GZipStream(stream, CompressionMode.Compress))
        {
            using (BinaryWriter w = new BinaryWriter(gzStream))
            {
                w.Write(Convert.FromBase64String(base64Content));
            }
        }
    }
}