/// <summary>
/// Turns an array of text lines into a <c>DocumentCorpus</c> containing one document per line.
/// </summary>
/// <remarks>
/// When <c>mIsNamedDoc</c> is set and a line contains a space, tab or newline, the part before
/// the first such character becomes the document name and the remainder becomes the text.
/// Otherwise the name is empty and the whole (trimmed) line is the text. Each document gets a
/// "_time" feature; the corpus gets "_provider", "_isNamedDoc", "_timeStart" and "_timeEnd".
/// </remarks>
/// <param name="sender">The producer that sent the data (not used here).</param>
/// <param name="data">Must be a <c>string[]</c>; anything else is rejected via <c>Utils.ThrowException</c>.</param>
/// <returns>The newly built corpus.</returns>
protected override object ProcessData(IDataProducer sender, object data)
{
    Utils.ThrowException(data is string[] ? null : new ArgumentTypeException("data"));
    DateTime startedAt = DateTime.Now;
    string[] lines = (string[])data;
    char[] separators = new char[] { ' ', '\t', '\n' };
    DocumentCorpus result = new DocumentCorpus();
    for (int lineIdx = 0; lineIdx < lines.Length; lineIdx++)
    {
        string line = lines[lineIdx];
        int sepPos = line.IndexOfAny(separators);
        string docName;
        string docText;
        if (mIsNamedDoc && sepPos >= 0)
        {
            // "name<separator>text" layout
            docName = line.Substring(0, sepPos).Trim();
            docText = line.Substring(sepPos).Trim();
        }
        else
        {
            docName = "";
            docText = line.Trim();
        }
        Document document = new Document(docName, docText);
        document.Features.SetFeatureValue("_time", DateTime.Now.ToString(Utils.DATE_TIME_SIMPLE));
        result.AddDocument(document);
    }
    result.Features.SetFeatureValue("_provider", GetType().ToString());
    result.Features.SetFeatureValue("_isNamedDoc", mIsNamedDoc.ToString());
    result.Features.SetFeatureValue("_timeStart", startedAt.ToString(Utils.DATE_TIME_SIMPLE));
    result.Features.SetFeatureValue("_timeEnd", DateTime.Now.ToString(Utils.DATE_TIME_SIMPLE));
    return result;
}
/// <summary>
/// Loads a FIRST XML corpus from <paramref name="file"/> and serializes every document in it
/// as GATE XML (with the top element written and boilerplate removed).
/// </summary>
/// <param name="file">Path of the FIRST XML corpus file to read.</param>
/// <returns>One GATE XML string per document, in corpus order.</returns>
public static List<String> convertFIRSTXMLtoGATE(String file)
{
    // load FIRST XML corpus; the readers are disposed even if parsing fails
    DocumentCorpus corpus = new DocumentCorpus();
    using (StreamReader fileReader = new StreamReader(file))
    using (XmlTextReader xmlReader = new XmlTextReader(fileReader))
    {
        corpus.ReadXml(xmlReader);
    }

    // save documents as GATE XML
    XmlWriterSettings xmlSettings = new XmlWriterSettings();
    xmlSettings.Indent = true;
    xmlSettings.NewLineOnAttributes = true;
    xmlSettings.CheckCharacters = false;
    xmlSettings.Encoding = Encoding.UTF8;

    List<String> documents = new List<string>();
    foreach (Document doc in corpus.Documents)
    {
        StringBuilder docXML = new StringBuilder();
        using (XmlWriter writer = XmlWriter.Create(docXML, xmlSettings))
        {
            doc.WriteGateXml(writer, /*writeTopElement=*/true, /*removeBoilerplate=*/true);
        }
        // The writer buffers its output, so the builder must only be read after the writer
        // has been closed; reading it earlier can yield empty or truncated XML.
        documents.Add(docXML.ToString());
    }
    return documents;
}
/// <summary>
/// Converts one hard-coded FIRST XML corpus file to GATE XML, writing each document to a
/// numbered file (1.xml, 2.xml, ...) in the hard-coded output folder, then waits for a key press.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // load FIRST XML corpus; the readers are disposed even if parsing fails
    DocumentCorpus corpus = new DocumentCorpus();
    using (StreamReader fileReader = new StreamReader(@"D:\streamer\files\23_55_08_450f24c0969d49d2883fc17a6f4e2af0.xml"))
    using (XmlTextReader xmlReader = new XmlTextReader(fileReader))
    {
        corpus.ReadXml(xmlReader);
    }

    // save documents as GATE XML
    XmlWriterSettings xmlSettings = new XmlWriterSettings();
    xmlSettings.Indent = true;
    xmlSettings.NewLineOnAttributes = true;
    xmlSettings.CheckCharacters = false;
    xmlSettings.Encoding = Encoding.UTF8;

    int i = 0;
    foreach (Document doc in corpus.Documents)
    {
        // The XML writer is disposed first (flushing into the stream writer), then the file is closed.
        using (StreamWriter streamWriter = new StreamWriter(string.Format(@"D:\streamer\output\{0}.xml", ++i)))
        using (XmlWriter writer = XmlWriter.Create(streamWriter, xmlSettings))
        {
            doc.WriteGateXml(writer, /*writeTopElement=*/true, /*removeBoilerplate=*/true);
        }
    }
    Console.ReadKey();
}
/// <summary>
/// Starts the receiver thread unless the component is already running.
/// </summary>
/// <remarks>
/// The thread polls <c>mMessenger</c> until <c>mStopped</c> is set or the messenger reports that
/// messaging has finished. Every non-null message is parsed as corpus XML and dispatched; a
/// message that fails to parse or dispatch is logged and skipped.
/// </remarks>
public override void Start()
{
    if (IsRunning)
    {
        return;
    }
    mThread = new Thread(new ThreadStart(
        delegate()
        {
            while (!mStopped && !mMessenger.isMessagingFinished())
            {
                string message = mMessenger.getMessage();
                if (message != null)
                {
                    try
                    {
                        DocumentCorpus dc = new DocumentCorpus();
                        // dispose the reader even when ReadXml throws
                        using (XmlReader reader = new XmlTextReader(new StringReader(message)))
                        {
                            dc.ReadXml(reader);
                        }
                        DispatchData(dc);
                    }
                    catch (Exception e)
                    {
                        // one bad message must not kill the receiver loop
                        mLogger.Error("ZeroMqReceiverComponent", e);
                    }
                }
                Thread.Sleep(1);
            }
        }
    ));
    mStopped = false;
    mThread.Start();
}
/// <summary>
/// Scratch entry point: builds a two-document corpus, serializes it to XML in memory and
/// prints the result to the console.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    Console.WriteLine("hello worlds!");

    Document doc = new Document("name", "bla bla");
    Document doc2 = new Document("name2", "bla bla 2");
    // NOTE(review): the span 0..100 is longer than the 7-character text "bla bla" -- confirm
    // whether an out-of-range annotation is intended here.
    doc.AddAnnotation(new Annotation(0, 100, "waka waka"));

    DocumentCorpus corpus = new DocumentCorpus();
    corpus.AddDocument(doc);
    corpus.AddDocument(doc2);

    StringWriter sw = new StringWriter();
    using (XmlTextWriter writer = new XmlTextWriter(sw))
    {
        corpus.WriteXml(writer);
    }
    // StringWriter keeps its content after being closed, so it can still be printed here.
    Console.WriteLine(sw);
}
/// <summary>
/// Posts a summary of every document in <paramref name="corpus"/> (URL, title, source and a
/// text snippet) to the feed form as one form-encoded POST request.
/// </summary>
/// <remarks>
/// The snippet is assembled from "TextBlock/Content" blocks for which
/// <c>IsSubstring(block, title)</c> is below 0.2, collected until more than 600 characters are
/// gathered and then truncated to 500. Request setup and body upload are outside the try
/// block, so failures there propagate to the caller; only a failure while getting the
/// response is reported as <c>false</c>.
/// </remarks>
/// <param name="corpus">Corpus whose documents are reported.</param>
/// <returns><c>true</c> if the server response was obtained; <c>false</c> if getting it threw.</returns>
private bool SendDocumentCorpusInfo(DocumentCorpus corpus)
{
    // taken from Latino.Web WebUtils.cs
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create("http://first-vm4.ijs.si/feed-form/");
    request.UserAgent = "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.8.0.6) Gecko/20060728 Firefox/1.5.0.6";
    request.Accept = "text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,*/*;q=0.5";
    request.Headers.Add("Accept-Language", "en-us,en;q=0.5");
    request.Headers.Add("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.7");

    // configure POST request
    request.CookieContainer = mCookies;
    request.Method = "POST";
    StringBuilder postData = new StringBuilder(string.Format("csrfmiddlewaretoken={0}&form-TOTAL_FORMS={1}&form-INITIAL_FORMS=0", mCsrftoken, corpus.Documents.Count));
    int i = 0;
    foreach (Document document in corpus.Documents)
    {
        string title = Utils.ToOneLine(document.Name, /*compact=*/ true);
        TextBlock[] textBlocks = document.GetAnnotatedBlocks("TextBlock/Content");
        StringBuilder text = new StringBuilder();
        foreach (TextBlock textBlock in textBlocks)
        {
            // skip blocks that mostly repeat the title
            if (IsSubstring(textBlock.Text, title) < 0.2)
            {
                text.AppendLine(textBlock.Text);
                if (text.Length > 600)
                {
                    break;
                }
            }
        }
        // &form-0-url=...&form-0-title=...&form-0-source=...&form-0-snippet=...&form-0-timestamp=...
        // NOTE(review): the timestamp field is a hard-coded constant for every document; it
        // looks like a placeholder -- confirm what value the form expects.
        string docData = string.Format("&form-{5}-url={0}&form-{5}-title={1}&form-{5}-source={2}&form-{5}-snippet={3}&form-{5}-timestamp={4}",
            HttpUtility.UrlEncode(document.Features.GetFeatureValue("responseUrl")),
            HttpUtility.UrlEncode(HttpUtility.HtmlEncode(title)),
            HttpUtility.UrlEncode(corpus.Features.GetFeatureValue("siteId")),
            HttpUtility.UrlEncode(HttpUtility.HtmlEncode(Utils.ToOneLine(Utils.Truncate(text.ToString(), 500), /*compact=*/ true)) + " ..."),
            "2012-04-13+16%3A47%3A38",
            i++);
        postData.Append(docData);
    }

    byte[] buffer = Encoding.ASCII.GetBytes(postData.ToString());
    request.ContentLength = buffer.Length;
    request.ContentType = "application/x-www-form-urlencoded";
    // dispose the request stream even if the write fails
    using (Stream dataStream = request.GetRequestStream())
    {
        dataStream.Write(buffer, 0, buffer.Length);
    }

    // send request
    try
    {
        request.GetResponse().Close();
        return true;
    }
    catch (Exception)
    {
        // deliberate best effort: the caller retries once after refreshing the cookie
        return false;
    }
}
/// <summary>
/// Reports the received corpus to the feed form. If no CSRF token is available yet, or the
/// first attempt fails, the Django cookie is fetched again and the report is sent once more.
/// </summary>
/// <param name="sender">The producer that sent the data (not used here).</param>
/// <param name="data">Must be a <c>DocumentCorpus</c>; anything else is rejected via <c>Utils.ThrowException</c>.</param>
protected override void ConsumeData(IDataProducer sender, object data)
{
    Utils.ThrowException(data is DocumentCorpus ? null : new ArgumentTypeException("data"));
    DocumentCorpus receivedCorpus = (DocumentCorpus)data;

    // Without a token the first attempt is skipped altogether.
    bool delivered = mCsrftoken != null && SendDocumentCorpusInfo(receivedCorpus);
    if (delivered)
    {
        return;
    }

    GetDjangoCookie();
    // The outcome of the retry is intentionally not inspected.
    SendDocumentCorpusInfo(receivedCorpus);
}
/// <summary>
/// Runs <c>OnFilterDocument</c> over every document of the incoming corpus. Documents for which
/// the filter returns <c>false</c> are removed from the corpus and dispatched, as a separate
/// "dump" corpus carrying the same features, to the dump consumers.
/// </summary>
/// <param name="sender">The producer that sent the data (not used here).</param>
/// <param name="data">Expected to be a <c>DocumentCorpus</c>; it is modified in place.</param>
/// <returns>
/// The (filtered) input corpus, or <c>null</c> if no documents are left. If anything outside
/// the per-document filter call throws, the error is logged and <paramref name="data"/> is
/// returned as received.
/// </returns>
public /*protected*/ override object ProcessData(IDataProducer sender, object data)
{
    try
    {
        DocumentCorpus corpus = (DocumentCorpus)data;
        DocumentCorpus dumpCorpus = new DocumentCorpus();
        dumpCorpus.CopyFeaturesFrom(corpus);

        // Rejected documents are collected first and removed afterwards, so the corpus is
        // not modified while it is being enumerated.
        ArrayList<Document> dumpDocumentList = new ArrayList<Document>();
        foreach (Document document in corpus.Documents)
        {
            try
            {
                if (OnFilterDocument != null && !OnFilterDocument(document, mLogger))
                {
                    dumpDocumentList.Add(document);
                }
            }
            catch (Exception exception)
            {
                // a failing filter keeps the document in the corpus
                mLogger.Error("ProcessDocument", exception);
            }
        }

        foreach (Document doc in dumpDocumentList)
        {
            corpus.Remove(doc);
            dumpCorpus.AddDocument(doc);
        }

        if (dumpCorpus.Documents.Count > 0)
        {
            WorkflowUtils.DispatchData(this, dumpCorpus, mCloneDumpOnFork, mDumpDispatchPolicy, mDumpDataConsumers, mLogger);
        }
        return corpus.Documents.Count > 0 ? corpus : null;
    }
    catch (Exception exception)
    {
        mLogger.Error("ProcessData", exception);
        return data;
    }
}
/// <summary>
/// Stores every document of the received corpus in the database, together with its
/// sentiment-object occurrences, sentiment-word occurrences and per-block sentiment counts.
/// </summary>
/// <remarks>
/// The document's date is its publication date when that date parses, lies before the
/// retrieval time and is less than three days older than it; otherwise it is the retrieval date.
/// The database document ID is the MD5 hash of the corpus GUID bytes followed by the document
/// GUID bytes. Blocks are selected with "TextBlock/Content" when the "rev" feature is "1",
/// otherwise with "TextBlock/Content/Unseen". Block and sentence numbers are 1-based and run
/// across the whole document.
/// </remarks>
/// <param name="sender">The producer that sent the data (not used here).</param>
/// <param name="data">Expected to be a <c>DocumentCorpus</c>.</param>
protected override void ConsumeData(IDataProducer sender, object data)
{
    DocumentCorpus c = (DocumentCorpus)data;
    foreach (Document doc in c.Documents)
    {
        short sentenceNum = 0, blockNum = 0;
        string responseUrl = doc.Features.GetFeatureValue("responseUrl") ?? "";
        string urlKey = doc.Features.GetFeatureValue("urlKey") ?? "";
        string title = doc.Features.GetFeatureValue("title") ?? "";
        string domainName = doc.Features.GetFeatureValue("domainName") ?? "";

        //********************* date = pubDate if |pubDate - timeGet| < 3 days
        string pubDate = doc.Features.GetFeatureValue("pubDate") ?? "";
        DateTime timeGet = DateTime.Parse(doc.Features.GetFeatureValue("time"));
        string date = timeGet.ToString("yyyy-MM-dd");
        // An unparsable publication date simply leaves the retrieval date in place.
        DateTime parsedPubDate;
        if (DateTime.TryParse(pubDate, out parsedPubDate)
            && DateTime.Compare(parsedPubDate, timeGet) < 0
            && timeGet.Subtract(parsedPubDate).CompareTo(TimeSpan.FromDays(3)) < 0)
        {
            date = parsedPubDate.ToString("yyyy-MM-dd");
        }

        //******************* document to database
        double pumpDumpIndex = Convert.ToDouble(doc.Features.GetFeatureValue("pumpIndex"));
        bool isFinancial = doc.Features.GetFeatureValue("isFinancial") == "True";
        // compute new ID
        Guid cGuid = new Guid(c.Features.GetFeatureValue("guid"));
        Guid dGuid = new Guid(doc.Features.GetFeatureValue("guid"));
        ArrayList<byte> buffer = new ArrayList<byte>();
        buffer.AddRange(cGuid.ToByteArray());
        buffer.AddRange(dGuid.ToByteArray());
        Guid documentId;
        using (MD5 md5 = MD5.Create())
        {
            documentId = new Guid(md5.ComputeHash(buffer.ToArray()));
        }
        long docId = ToDb.DocumentToDb(mConnection, title, date, pubDate, timeGet.ToString("yyyy-MM-dd HH:mm"), responseUrl, urlKey, domainName, isFinancial, pumpDumpIndex, documentId);

        //******************* occurrences
        string blockSelector = "TextBlock/Content";
        string rev = doc.Features.GetFeatureValue("rev");
        if (rev != "1")
        {
            blockSelector = "TextBlock/Content/Unseen";
        }

        //******************** occurrences to database
        doc.CreateAnnotationIndex();
        foreach (TextBlock tb in doc.GetAnnotatedBlocks(blockSelector))
        {
            int tokensPerBlock = doc.GetAnnotatedBlocks("Token", tb.SpanStart, tb.SpanEnd).Length;
            int blockNeg = 0, blockPoz = 0;
            blockNum++;
            // sentences within text block tb
            foreach (TextBlock s in doc.GetAnnotatedBlocks("Sentence", tb.SpanStart, tb.SpanEnd))
            {
                sentenceNum++;
                // sentiment objects within sentence s
                foreach (TextBlock so in doc.GetAnnotatedBlocks("SentimentObject", s.SpanStart, s.SpanEnd))
                {
                    Annotation annot = so.Annotation;
                    string instUri = annot.Features.GetFeatureValue("instanceUri");
                    string term = so.Text; // the actual text covered by the annotation
                    long occId = ToDb.OccurrenceToDb(mConnection, date, annot.SpanStart, annot.SpanEnd, sentenceNum, blockNum, docId, instUri);
                    ToDb.TermToDb(mConnection, occId, term);
                }
                // sentiment words within sentence s
                foreach (TextBlock so in doc.GetAnnotatedBlocks("SentimentWord", s.SpanStart, s.SpanEnd))
                {
                    Annotation annot = so.Annotation;
                    string instUri = annot.Features.GetFeatureValue("instanceUri");
                    string instClassUri = annot.Features.GetFeatureValue("instanceClassUri");
                    if (instClassUri.EndsWith("PositiveWord"))
                    {
                        blockPoz++;
                    }
                    else if (instClassUri.EndsWith("NegativeWord"))
                    {
                        blockNeg++;
                    }
                    // insert into SQL table SentimentWordOccurrence
                    ToDb.SentimentWordOccurrenceToDb(mConnection, date, annot.SpanStart, annot.SpanEnd, sentenceNum, blockNum, docId, instUri);
                }
            }
            // insert into SQL table BlockSentiment (only blocks that contain sentiment words)
            if (blockNeg != 0 || blockPoz != 0)
            {
                ToDb.BlockSentimentToDb(mConnection, docId, blockNum, blockPoz, blockNeg, tokensPerBlock);
            }
        }
    }
}
/// <summary>
/// Handles one RSS item: optionally logs it to the database, and -- unless its GUID is already
/// in the history -- downloads the linked page, builds a document from it and adds the document
/// to <paramref name="corpus"/>.
/// </summary>
/// <remarks>
/// Items whose download returns no bytes, or whose content type is excluded by
/// <c>mContentFilter</c>, are added to the history and dropped. When no content was downloaded
/// the item's description (or, failing that, its title) is used as the document text.
/// Any exception is logged as a warning and swallowed.
/// </remarks>
/// <param name="itemAttr">Attributes read from the RSS item; download metadata is added to it.</param>
/// <param name="corpus">Corpus that receives the new document.</param>
/// <param name="rssXmlUrl">URL of the RSS feed the item came from.</param>
/// <param name="xml">The complete RSS XML the item was read from.</param>
private void ProcessItem(Dictionary<string, string> itemAttr, DocumentCorpus corpus, string rssXmlUrl, string xml)
{
    try
    {
        // TryGetValue leaves these null when the attribute is missing.
        string title;
        string description;
        string published;
        itemAttr.TryGetValue("title", out title);
        itemAttr.TryGetValue("description", out description);
        itemAttr.TryGetValue("pubDate", out published);
        Guid itemGuid = MakeGuid(title, description, published);
        mLogger.Info("ProcessItem", "Found item \"{0}\".", Utils.ToOneLine(title, /*compact=*/true));

        if (mDbConnectionString != null)
        {
            // record the item source and the RSS XML it came from
            string category;
            string entities;
            itemAttr.TryGetValue("category", out category);
            itemAttr.TryGetValue("emm:entity", out entities);
            string xmlHash = Utils.GetHashCode128(xml).ToString("N");
            using (SqlConnection connection = new SqlConnection(mDbConnectionString))
            {
                connection.Open();
                using (SqlCommand insertSource = new SqlCommand("insert into Sources (siteId, docId, sourceUrl, category, entities, xmlHash) values (@siteId, @docId, @sourceUrl, @category, @entities, @xmlHash)", connection))
                {
                    WorkflowUtils.AssignParamsToCommand(insertSource,
                        "siteId", Utils.Truncate(mSiteId, 100),
                        "docId", itemGuid.ToString("N"),
                        "sourceUrl", Utils.Truncate(rssXmlUrl, 400),
                        "category", category,
                        "entities", entities,
                        "xmlHash", xmlHash);
                    insertSource.ExecuteNonQuery();
                }
                using (SqlCommand insertXml = new SqlCommand("insert into RssXml (hash, xml) values (@hash, @xml)", connection))
                {
                    WorkflowUtils.AssignParamsToCommand(insertXml, "hash", xmlHash, "xml", xml);
                    insertXml.ExecuteNonQuery();
                }
            }
        }

        if (mHistory.CheckHistory(itemGuid))
        {
            return; // already seen
        }

        DateTime fetchTime = DateTime.Now;
        string content = "";
        string link;
        if (itemAttr.TryGetValue("link", out link) && link.Trim() != "")
        {
            // get referenced Web page
            mLogger.Info("ProcessItem", "Getting HTML from {0} ...", Utils.ToOneLine(link, /*compact=*/true));
            string mimeType;
            string charSet;
            string responseUrl;
            CookieContainer cookies = null;
            byte[] payload = WebUtils.GetWebResource(link, /*refUrl=*/null, ref cookies, WebUtils.DefaultTimeout, out mimeType, out charSet, mSizeLimit, out responseUrl);
            if (payload == null)
            {
                mLogger.Info("ProcessItem", "Item rejected because of its size.");
                mHistory.AddToHistory(itemGuid, mSiteId);
                return;
            }
            ContentType contentType = GetContentType(mimeType);
            if ((contentType & mContentFilter) == 0)
            {
                mLogger.Info("ProcessItem", "Item rejected because of its content type.");
                mHistory.AddToHistory(itemGuid, mSiteId);
                return;
            }
            if (charSet == null)
            {
                charSet = "ISO-8859-1";
            }
            itemAttr.Add("responseUrl", responseUrl);
            itemAttr.Add("mimeType", mimeType);
            itemAttr.Add("contentType", contentType.ToString());
            itemAttr.Add("charSet", charSet);
            itemAttr.Add("contentLength", payload.Length.ToString());
            if (contentType == ContentType.Binary)
            {
                // save as base64-encoded binary data
                content = Convert.ToBase64String(payload);
            }
            else
            {
                // save as text
                content = GetEncoding(charSet).GetString(payload);
                if (mIncludeRawData)
                {
                    itemAttr.Add("raw", Convert.ToBase64String(payload));
                }
            }
            Thread.Sleep(mPolitenessSleep);
        }

        if (content == "")
        {
            // nothing downloaded: fall back to the description, then to the title
            string fallback;
            if (itemAttr.TryGetValue("description", out fallback) || itemAttr.TryGetValue("title", out fallback))
            {
                content = fallback;
            }
        }

        itemAttr.Add("guid", itemGuid.ToString());
        itemAttr.Add("time", fetchTime.ToString(Utils.DATE_TIME_SIMPLE));
        Document document = new Document(title, content);
        foreach (KeyValuePair<string, string> pair in itemAttr)
        {
            document.Features.SetFeatureValue(pair.Key, pair.Value);
        }
        corpus.AddDocument(document);
        mHistory.AddToHistory(itemGuid, mSiteId);
    }
    catch (Exception e)
    {
        mLogger.Warn("ProcessItem", e);
    }
}
/// <summary>
/// Loads a stored corpus file, looks up one document in it and returns that document rendered
/// in the requested format.
/// </summary>
/// <remarks>
/// Both IDs must be 32 hexadecimal digits once dashes are removed. This is enforced strictly
/// because the corpus ID becomes part of a file path and of a file search pattern.
/// When <paramref name="corpusTime"/> is given, the file is located directly from the time and
/// the ID; otherwise the data folder is searched recursively and the first match is used.
/// </remarks>
/// <param name="corpusId">Corpus GUID, with or without dashes.</param>
/// <param name="docId">Document GUID, with or without dashes (letter case is ignored).</param>
/// <param name="format">"html", "txt", "gate_xml"; any other value yields the native XML.</param>
/// <param name="rmvRaw">If set, the "raw" feature is removed before rendering.</param>
/// <param name="changesOnly">For "txt": if set and the document's "rev" feature is not "1", only "TextBlock/Content/Unseen" blocks are returned.</param>
/// <param name="corpusTime">Optional corpus time used to locate the file without searching.</param>
/// <returns>The rendered document, or a message starting with "***" describing the failure.</returns>
public string GetDoc(string corpusId, string docId, string format, bool rmvRaw, bool changesOnly, string corpusTime)
{
    string dataPath = Utils.GetConfigValue("DataPath", ".");
    if (corpusId == null)
    {
        return "*** Invalid corpus ID.";
    }
    corpusId = corpusId.Replace("-", "");
    // hex digits only: rejects path separators and wildcards smuggled in through the ID
    if (!Regex.IsMatch(corpusId, @"\A[0-9a-fA-F]{32}\z"))
    {
        return "*** Invalid corpus ID.";
    }
    if (docId == null)
    {
        return "*** Invalid document ID.";
    }
    docId = docId.Replace("-", "");
    if (!Regex.IsMatch(docId, @"\A[0-9a-fA-F]{32}\z"))
    {
        return "*** Invalid document ID.";
    }

    string[] fileNames = null;
    if (!string.IsNullOrEmpty(corpusTime))
    {
        try
        {
            DateTime dt = DateTime.Parse(corpusTime);
            string prefix = dt.ToString("HH_mm_ss_");
            string path = "\\" + dt.Year + "\\" + dt.Month + "\\" + dt.Day + "\\";
            string fileName = dataPath.TrimEnd('\\') + path + prefix + corpusId + ".xml";
            if (!Utils.VerifyFileNameOpen(fileName))
            {
                return "*** Corpus not found.";
            }
            fileNames = new string[] { fileName };
        }
        catch
        {
            return "*** Unable to parse time.";
        }
    }
    if (fileNames == null)
    {
        fileNames = Directory.GetFiles(dataPath, "*" + corpusId + ".xml", SearchOption.AllDirectories);
    }
    if (fileNames.Length == 0)
    {
        return "*** Corpus not found.";
    }

    // the readers are disposed even if parsing fails
    DocumentCorpus corpus = new DocumentCorpus();
    using (StreamReader reader = new StreamReader(fileNames[0]))
    using (XmlTextReader xmlReader = new XmlTextReader(reader))
    {
        corpus.ReadXml(xmlReader);
    }

    Document document = null;
    foreach (Document doc in corpus.Documents)
    {
        // the "N" format is lowercase, so compare case-insensitively against the caller's ID
        string candidateId = new Guid(doc.Features.GetFeatureValue("guid")).ToString("N");
        if (string.Equals(candidateId, docId, StringComparison.OrdinalIgnoreCase))
        {
            document = doc;
            break;
        }
    }
    if (document == null)
    {
        return "*** Document not found.";
    }
    if (rmvRaw)
    {
        document.Features.RemoveFeature("raw");
    }

    string response;
    if (format == "html")
    {
        StringWriter writer = new StringWriter();
        document.MakeHtmlPage(writer, /*inlineCss=*/true);
        // strip the back-button markup from the generated page
        string html = new Regex(@"<!--back_button-->.*?<!--/back_button-->").Replace(writer.ToString(), "");
        response = html;
    }
    else if (format == "txt")
    {
        StringBuilder txt = new StringBuilder();
        string selector = "TextBlock/Content";
        if (changesOnly && document.Features.GetFeatureValue("rev") != "1")
        {
            selector = "TextBlock/Content/Unseen";
        }
        foreach (TextBlock block in document.GetAnnotatedBlocks(selector))
        {
            txt.AppendLine(block.Text);
        }
        response = document.Name + "\r\n\r\n" + txt.ToString();
    }
    else
    {
        StringWriter writer = new StringWriter();
        XmlWriterSettings xmlSettings = new XmlWriterSettings();
        xmlSettings.Indent = true;
        xmlSettings.NewLineOnAttributes = true;
        xmlSettings.CheckCharacters = false;
        using (XmlWriter xmlWriter = XmlWriter.Create(writer, xmlSettings))
        {
            if (format == "gate_xml")
            {
                document.WriteGateXml(xmlWriter, /*writeTopElement=*/true, /*removeBoilerplate=*/true);
                xmlWriter.Flush();
                response = writer.ToString();
            }
            else // xml
            {
                document.WriteXml(xmlWriter, /*writeTopElement=*/true);
                xmlWriter.Flush();
                // the in-memory writer declares utf-16; the response is labelled utf-8 instead
                response = writer.ToString().Replace("<?xml version=\"1.0\" encoding=\"utf-16\"?>", "<?xml version=\"1.0\" encoding=\"utf-8\"?>");
            }
        }
    }
    return response;
}
/// <summary>
/// Reads the next corpus XML file from the configured data folders and returns it in a
/// "fresh" state, ready to be processed again.
/// </summary>
/// <remarks>
/// Files of each folder are processed in sorted order, one file per call. The corpus gets a new
/// GUID, leading underscores are stripped from corpus and document feature names, annotations
/// and processing-specific features are removed, and when a document carries a "raw" feature
/// its text is rebuilt from that data. Before returning, the call waits while the branch load
/// reported by <c>WorkflowUtils.GetBranchLoadMax</c> is above 10.
/// The file index is advanced before the file is read, so a file that fails is not retried and
/// the exception reaches the caller with its original stack trace.
/// </remarks>
/// <returns>
/// The loaded corpus; <c>null</c> when a folder has just been exhausted or when all folders are
/// done (in which case <c>Stop</c> is called).
/// </returns>
protected override object ProduceData()
{
    // are we done?
    if (mCurrentDirIdx >= mDataDirs.Length)
    {
        Stop();
        return null;
    }
    // do we need to get more files?
    if (mFiles == null)
    {
        mFiles = Directory.GetFiles(mDataDirs[mCurrentDirIdx], "*.xml");
        Array.Sort(mFiles);
    }
    // did we process all currently available files?
    if (mCurrentFileIdx >= mFiles.Length)
    {
        mFiles = null;
        mCurrentFileIdx = 0;
        mCurrentDirIdx++;
        return null;
    }

    // Advance exactly once, whether or not reading succeeds.
    string fileName = mFiles[mCurrentFileIdx];
    mCurrentFileIdx++;

    // read next file
    mLogger.Info("ProduceData", "Reading " + fileName + " ...");
    DocumentCorpus corpus = new DocumentCorpus();
    using (StreamReader reader = new StreamReader(fileName))
    {
        corpus.ReadXml(new XmlTextReader(reader));
    }

    // refresh corpus ID (to avoid conflicts)
    corpus.Features.SetFeatureValue("guid", Guid.NewGuid().ToString());

    // remove underscores in feature names (iterate over a copy because features are modified)
    string[] tmp = new string[corpus.Features.Names.Count];
    corpus.Features.Names.CopyTo(tmp, /*index=*/ 0);
    foreach (string featureName in tmp)
    {
        if (featureName.StartsWith("_"))
        {
            corpus.Features.SetFeatureValue(featureName.TrimStart('_'), corpus.Features.GetFeatureValue(featureName));
            corpus.Features.RemoveFeature(featureName);
        }
    }

    foreach (Document doc in corpus.Documents)
    {
        // remove annotations
        doc.ClearAnnotations();
        // remove underscores in feature names
        tmp = new string[doc.Features.Names.Count];
        doc.Features.Names.CopyTo(tmp, /*index=*/ 0);
        foreach (string featureName in tmp)
        {
            if (featureName.StartsWith("_"))
            {
                doc.Features.SetFeatureValue(featureName.TrimStart('_'), doc.Features.GetFeatureValue(featureName));
                doc.Features.RemoveFeature(featureName);
            }
        }
        // remove processing-specific features
        foreach (string featureName in new string[] { "detectedLanguage", "detectedCharRange", "bprBoilerplateCharCount", "bprContentCharCount", "domainName", "urlKey", "rev", "blacklisted" })
        {
            doc.Features.RemoveFeature(featureName);
        }
        // if there's raw data available, reset the content
        string raw = doc.Features.GetFeatureValue("raw");
        if (raw != null)
        {
            doc.Features.SetFeatureValue("contentType", "Html");
            doc.Text = GetEncoding(doc.Features.GetFeatureValue("charSet")).GetString(Convert.FromBase64String(raw));
        }
    }

    // throttle while the downstream branches are busy
    while (WorkflowUtils.GetBranchLoadMax(this) > 10)
    {
        Thread.Sleep(1000);
    }
    return corpus;
}
/// <summary>
/// Polls every RSS source once: downloads the feed XML, reads its items, and dispatches the
/// resulting documents as one or more corpora.
/// </summary>
/// <remarks>
/// For each item, the elements listed in <c>mItemElements</c> are collected as attributes;
/// repeated elements are joined with " ;; ". A corpus is dispatched as soon as it holds
/// <c>mMaxDocsPerCorpus</c> documents (when that limit is positive) and again at the end of the
/// feed if any documents remain. Corpora are dispatched from inside this method, so the method
/// itself always returns <c>null</c>. Errors while processing a source are logged and the next
/// source is tried.
/// </remarks>
/// <returns>Always <c>null</c>.</returns>
protected override object ProduceData()
{
    for (int i = 0; i < mSources.Count; i++)
    {
        string url = mSources[i];
        int numNewItems = 0;
        try
        {
            DateTime timeStart = DateTime.Now;
            // get RSS XML
            string xml;
            try
            {
                mLogger.Info("ProduceData", "Getting RSS XML from {0} ...", url);
                xml = WebUtils.GetWebPageDetectEncoding(url);
                xml = FixXml(xml);
            }
            catch (Exception e)
            {
                mLogger.Error("ProduceData", e);
                // NOTE(review): this abandons all remaining sources for this round, whereas
                // other errors only skip the current source -- confirm this is intended.
                return null;
            }
            Dictionary<string, string> channelAttr = new Dictionary<string, string>();
            DocumentCorpus corpus = new DocumentCorpus();
            corpus.Features.SetFeatureValue("guid", Guid.NewGuid().ToString());
            // the reader is closed on every exit path (normal end, early return, exception)
            using (XmlTextReader reader = new XmlTextReader(new StringReader(xml)))
            {
                // first pass: channel attributes
                ReadChannelAttributes(url, xml, timeStart, channelAttr);
                // second pass: items
                mLogger.Info("ProduceData", "Reading items ...");
                while (reader.Read())
                {
                    if (reader.NodeType == XmlNodeType.Element && reader.Name == "item" && !reader.IsEmptyElement)
                    {
                        Dictionary<string, string> itemAttr = new Dictionary<string, string>();
                        while (reader.Read() && !(reader.NodeType == XmlNodeType.EndElement && reader.Name == "item"))
                        {
                            if (reader.NodeType == XmlNodeType.Element)
                            {
                                // handle item attributes
                                if (mItemElements.Contains(reader.Name))
                                {
                                    string attrName = reader.Name;
                                    // XML attributes must be read before the element value is consumed
                                    string emmTrigger = attrName == "category" ? reader.GetAttribute("emm:trigger") : null;
                                    string emmEntityId = attrName == "emm:entity" ? reader.GetAttribute("id") : null;
                                    string emmEntityName = attrName == "emm:entity" ? reader.GetAttribute("name") : null;
                                    string value = Utils.XmlReadValue(reader, attrName);
                                    if (value.Trim() != "")
                                    {
                                        string oldValue;
                                        if (attrName == "pubDate")
                                        {
                                            string tmp = Utils.NormalizeDateTimeStr(value);
                                            if (tmp != null)
                                            {
                                                value = tmp;
                                            }
                                        }
                                        if (emmTrigger != null)
                                        {
                                            value += " ; " + emmTrigger.Replace(';', ',').TrimEnd(' ', ',');
                                        }
                                        if (emmEntityId != null && emmEntityName != null)
                                        {
                                            value = string.Format("{0} ; {1} ; {2}", emmEntityName, value, emmEntityId);
                                        }
                                        if (itemAttr.TryGetValue(attrName, out oldValue))
                                        {
                                            itemAttr[attrName] = oldValue + " ;; " + value;
                                        }
                                        else
                                        {
                                            itemAttr.Add(attrName, value);
                                        }
                                    }
                                }
                                else
                                {
                                    Utils.XmlSkip(reader, reader.Name);
                                }
                            }
                        }
                        // stopped?
                        if (mStopped)
                        {
                            if (corpus.Documents.Count == 0)
                            {
                                return null;
                            }
                            break;
                        }
                        ProcessItem(itemAttr, corpus, url, xml);
                        if (mMaxDocsPerCorpus > 0 && corpus.Documents.Count == mMaxDocsPerCorpus)
                        {
                            // corpus is full: finalize, dispatch and start a new one
                            numNewItems += corpus.Documents.Count;
                            foreach (KeyValuePair<string, string> attr in channelAttr)
                            {
                                corpus.Features.SetFeatureValue(attr.Key, attr.Value);
                            }
                            corpus.Features.SetFeatureValue("timeEnd", DateTime.Now.ToString(Utils.DATE_TIME_SIMPLE));
                            DispatchData(corpus);
                            corpus = new DocumentCorpus();
                            corpus.Features.SetFeatureValue("guid", Guid.NewGuid().ToString());
                        }
                    }
                }
            }
            if (corpus.Documents.Count > 0)
            {
                // dispatch whatever is left
                numNewItems += corpus.Documents.Count;
                foreach (KeyValuePair<string, string> attr in channelAttr)
                {
                    corpus.Features.SetFeatureValue(attr.Key, attr.Value);
                }
                corpus.Features.SetFeatureValue("timeEnd", DateTime.Now.ToString(Utils.DATE_TIME_SIMPLE));
                DispatchData(corpus);
            }
            mLogger.Info("ProduceData", "{0} new items.", numNewItems);
            // stopped?
            if (mStopped)
            {
                return null;
            }
        }
        catch (Exception e)
        {
            mLogger.Error("ProduceData", e);
        }
    }
    return null;
}
/// <summary>
/// Persists every document of the received corpus: optionally as compressed XML, as compressed
/// raw HTML and as a viewable HTML page on disk, and as rows bulk-written to the "Documents"
/// and "TextBlocks" database tables.
/// </summary>
/// <remarks>
/// Each document is cloned before it is modified. Its new "guid" is the MD5 hash of the corpus
/// GUID bytes followed by the original document GUID bytes; the original GUID is kept in the
/// "hash" feature. Features of "TextBlock*" annotations are reduced to "linkToTextRatio" and
/// "domPath", while their "hash" values are collected and stored in the TextBlocks table.
/// Output files are placed under yyyy\MM\dd folders derived from the document's "time" feature.
/// </remarks>
/// <param name="sender">The producer that sent the data (not used here).</param>
/// <param name="data">Expected to be a <c>DocumentCorpus</c>.</param>
protected override void ConsumeData(IDataProducer sender, object data)
{
    DocumentCorpus c = (DocumentCorpus)data;
    DataTable dt = CreateTable();
    DataTable dtTextBlocks = CreateTextBlocksTable();
    foreach (Document doc in c.Documents)
    {
        Document d = doc.Clone();
        string rawHtml = d.Features.GetFeatureValue("raw");
        DateTime time = DateTime.Parse(d.Features.GetFeatureValue("time"));
        Guid cGuid = new Guid(c.Features.GetFeatureValue("guid"));
        Guid dGuid = new Guid(d.Features.GetFeatureValue("guid"));
        ArrayList<byte> buffer = new ArrayList<byte>();
        buffer.AddRange(cGuid.ToByteArray());
        buffer.AddRange(dGuid.ToByteArray());
        Guid docId;
        using (MD5 md5 = MD5.Create())
        {
            docId = new Guid(md5.ComputeHash(buffer.ToArray()));
        }
        d.Features.RemoveFeature("raw");
        DateTime timeEnd = DateTime.Parse(c.Features.GetFeatureValue("timeEnd"));
        d.Features.SetFeatureValue("oldId", string.Format("{0:HH}_{0:mm}_{0:ss}_{1:N}_{2:N}", timeEnd, cGuid, dGuid));
        d.Features.SetFeatureValue("hash", dGuid.ToString("N"));
        d.Features.SetFeatureValue("guid", docId.ToString("N"));
        d.Features.SetFeatureValue("rssUrl", c.Features.GetFeatureValue("sourceUrl"));
        d.Features.SetFeatureValue("siteId", c.Features.GetFeatureValue("siteId"));

        // remove boilerplate removal features, keep hash codes
        ArrayList<ulong> hashCodes = new ArrayList<ulong>();
        foreach (Annotation annot in d.Annotations)
        {
            if (annot.Type.StartsWith("TextBlock"))
            {
                ulong hashCode = Convert.ToUInt64(annot.Features.GetFeatureValue("hash"));
                hashCodes.Add(hashCode);
                string linkToTextRatio = annot.Features.GetFeatureValue("linkToTextRatio");
                string domPath = annot.Features.GetFeatureValue("domPath");
                annot.Features.Clear();
                annot.Features.SetFeatureValue("linkToTextRatio", linkToTextRatio);
                annot.Features.SetFeatureValue("domPath", domPath);
            }
        }

        // write doc XML
        if (mXmlDataRoot != null)
        {
            string outFileName = string.Format("{0}\\{1:yyyy}\\{1:MM}\\{1:dd}\\{1:HH}_{1:mm}_{1:ss}_{2:N}.xml.gz", mXmlDataRoot, time, docId);
            string path = new FileInfo(outFileName).DirectoryName;
            Directory.CreateDirectory(path);
            d.WriteXmlCompressed(outFileName);
        }

        // write raw HTML (skipped when the document carries no raw data)
        if (mHtmlDataRoot != null && rawHtml != null)
        {
            string outFileName = string.Format("{0}\\{1:yyyy}\\{1:MM}\\{1:dd}\\{1:HH}_{1:mm}_{1:ss}_{2:N}.html.gz", mHtmlDataRoot, time, docId);
            string path = new FileInfo(outFileName).DirectoryName;
            Directory.CreateDirectory(path);
            using (FileStream stream = new FileStream(outFileName, FileMode.Create))
            using (GZipStream gzStream = new GZipStream(stream, CompressionMode.Compress))
            using (BinaryWriter w = new BinaryWriter(gzStream))
            {
                w.Write(Convert.FromBase64String(rawHtml));
            }
        }

        // write view HTML
        if (mHtmlViewRoot != null)
        {
            string outFileName = string.Format("{0}\\{1:yyyy}\\{1:MM}\\{1:dd}\\{1:HH}_{1:mm}_{1:ss}_{2:N}.html", mHtmlViewRoot, time, docId);
            string path = new FileInfo(outFileName).DirectoryName.TrimEnd('\\');
            Directory.CreateDirectory(path);
            // The existence check is done under the lock as well, so two threads cannot both
            // decide to write the shared support files at the same time.
            lock (mStaticLock)
            {
                if (!File.Exists(path + "\\Styles.css") || !File.Exists(path + "\\Code.js"))
                {
                    string css = Utils.GetManifestResourceString(this.GetType(), "Styles.css");
                    string js = Utils.GetManifestResourceString(this.GetType(), "Code.js");
                    File.WriteAllText(path + "\\Styles.css", css);
                    File.WriteAllText(path + "\\Code.js", js);
                }
            }
            File.WriteAllText(outFileName, d.GetHtml(/*inlineCss=*/ false, /*inlineJs=*/ false), Encoding.UTF8);
        }

        // prepare for bulk write
        if (mConnectionString != null)
        {
            string fileName = string.Format("{0:yyyy}\\{0:MM}\\{0:dd}\\{0:HH}_{0:mm}_{0:ss}_{1:N}.xml.gz", time, docId);
            dt.Rows.Add(
                docId, // same value as the "guid" feature set above
                dGuid,
                Utils.Truncate(d.Name, 400),
                Utils.Truncate(d.Features.GetFeatureValue("description"), 400),
                Utils.Truncate(d.Text, 1000),
                Utils.Truncate(d.Features.GetFeatureValue("category"), 400),
                Utils.Truncate(d.Features.GetFeatureValue("link"), 400),
                Utils.Truncate(d.Features.GetFeatureValue("responseUrl"), 400),
                Utils.Truncate(d.Features.GetFeatureValue("urlKey"), 400),
                time,
                Utils.Truncate(d.Features.GetFeatureValue("pubDate"), 100),
                Utils.Truncate(d.Features.GetFeatureValue("mimeType"), 80),
                Utils.Truncate(d.Features.GetFeatureValue("charSet"), 40),
                Convert.ToInt32(d.Features.GetFeatureValue("contentLength")),
                Utils.Truncate(d.Features.GetFeatureValue("domainName"), 100),
                Convert.ToInt32(d.Features.GetFeatureValue("bprBoilerplateCharCount")),
                Convert.ToInt32(d.Features.GetFeatureValue("bprContentCharCount")),
                Convert.ToInt32(d.Features.GetFeatureValue("unseenContentCharCount")),
                Convert.ToInt32(d.Features.GetFeatureValue("rev")),
                Utils.Truncate(fileName, 100),
                Utils.Truncate(c.Features.GetFeatureValue("siteId"), 100)
            );
            // serialize the collected hash codes; only the bytes written so far are copied
            BinarySerializer memSer = new BinarySerializer();
            hashCodes.Save(memSer);
            byte[] hashCodesBinary = new byte[memSer.Stream.Position];
            Array.Copy(((MemoryStream)memSer.Stream).GetBuffer(), hashCodesBinary, hashCodesBinary.Length);
            dtTextBlocks.Rows.Add(
                docId,
                hashCodesBinary
            );
        }
    }

    // bulk write to database
    if (mConnectionString != null && dt.Rows.Count > 0)
    {
        using (SqlConnection connection = new SqlConnection(mConnectionString))
        {
            connection.Open();
            using (SqlBulkCopy bulkWriter = new SqlBulkCopy(connection))
            {
                bulkWriter.BulkCopyTimeout = mCommandTimeout;
                bulkWriter.DestinationTableName = "Documents";
                bulkWriter.WriteToServerRetryOnDeadlock(dt);
                bulkWriter.DestinationTableName = "TextBlocks";
                bulkWriter.WriteToServerRetryOnDeadlock(dtTextBlocks);
            }
        }
    }
}
// Converts one parsed RSS item into a Document and appends it to the given corpus.
//
// itemAttr  - attribute name/value pairs read from the <item> element; this method adds further
//             entries to it (responseUrl, mimeType, contentType, charSet, contentLength, raw, guid, time)
// corpus    - corpus that receives the new document
// rssXmlUrl - not used by this method
// xml       - not used by this method
//
// Items for which mHistory.CheckHistory(guid) returns true are skipped. Any exception raised while
// processing the item is logged as a warning and not propagated.
private void ProcessItem(Dictionary<string, string> itemAttr, DocumentCorpus corpus, string rssXmlUrl, string xml)
{
    try
    {
        // note: TryGetValue leaves the out variable null when the key is missing
        string name;
        itemAttr.TryGetValue("title", out name);
        string desc;
        itemAttr.TryGetValue("description", out desc);
        string pubDate;
        itemAttr.TryGetValue("pubDate", out pubDate);
        Guid guid = MakeGuid(name, desc, pubDate);
        mLogger.Info("ProcessItem", "Found item \"{0}\".", Utils.ToOneLine(name, /*compact=*/true));
        if (mHistory.CheckHistory(guid))
        {
            return; // nothing to do for this item
        }
        DateTime time = DateTime.Now;
        string content = "";
        string link;
        if (itemAttr.TryGetValue("link", out link) && link.Trim() != "")
        {
            // get referenced Web page
            mLogger.Info("ProcessItem", "Getting HTML from {0} ...", Utils.ToOneLine(link, /*compact=*/true));
            string mimeType;
            string charSet;
            string responseUrl;
            CookieContainer cookies = null;
            byte[] bytes = WebUtils.GetWebResource(link, /*refUrl=*/null, ref cookies, WebUtils.DefaultTimeout, out mimeType, out charSet, mSizeLimit, out responseUrl);
            if (bytes == null)
            {
                mLogger.Info("ProcessItem", "Item rejected because of its size.");
                mHistory.AddToHistory(guid, mSiteId);
                return;
            }
            ContentType contentType = GetContentType(mimeType);
            if ((contentType & mContentFilter) == 0)
            {
                mLogger.Info("ProcessItem", "Item rejected because of its content type.");
                mHistory.AddToHistory(guid, mSiteId);
                return;
            }
            itemAttr.Add("responseUrl", responseUrl);
            itemAttr.Add("mimeType", mimeType);
            itemAttr.Add("contentType", contentType.ToString());
            // fall back to the configured default when the response did not specify a character set
            charSet = charSet ?? Config.rssReaderDefaultHtmlEncoding;
            itemAttr.Add("charSet", charSet);
            itemAttr.Add("contentLength", bytes.Length.ToString());
            bool isBinary = contentType == ContentType.Binary;
            if (isBinary)
            {
                // save as base64-encoded binary data
                content = Convert.ToBase64String(bytes);
            }
            else
            {
                // save as text
                content = GetEncoding(charSet).GetString(bytes);
                if (mIncludeRawData)
                {
                    itemAttr.Add("raw", Convert.ToBase64String(bytes));
                }
            }
            Thread.Sleep(mPolitenessSleep);
        }
        if (content == "")
        {
            // no page content: use the description, or failing that the title
            string fallback;
            if (itemAttr.TryGetValue("description", out fallback) || itemAttr.TryGetValue("title", out fallback))
            {
                content = fallback;
            }
        }
        itemAttr.Add("guid", guid.ToString());
        itemAttr.Add("time", time.ToString(Utils.DATE_TIME_SIMPLE));
        Document document = new Document(name, content);
        foreach (KeyValuePair<string, string> feature in itemAttr)
        {
            document.Features.SetFeatureValue(feature.Key, feature.Value);
        }
        corpus.AddDocument(document);
        mHistory.AddToHistory(guid, mSiteId);
    }
    catch (Exception e)
    {
        mLogger.Warn("ProcessItem", e);
    }
}
// Polls every RSS source in mSources once. For each source it downloads and decodes the feed XML,
// reads the channel attributes, turns each non-empty <item> element into a document (via ProcessItem)
// and hands the resulting corpora to DispatchData. A corpus is dispatched as soon as it holds
// mMaxDocsPerCorpus documents (when that limit is > 0) and once more at the end of the feed if
// any documents remain.
//
// Returns: always null; the produced corpora leave this method through DispatchData only.
//
// Errors while downloading/decoding a feed are logged and end the whole call; errors while reading
// a feed are logged and the loop moves on to the next source.
protected override object ProduceData()
{
    for (int i = 0; i < mSources.Count; i++)
    {
        string url = mSources[i];
        int numNewItems = 0;
        try
        {
            DateTime timeStart = DateTime.Now;
            Dictionary<string, string> channelAttr = new Dictionary<string, string>();
            // get RSS XML
            string xml;
            try
            {
                mLogger.Info("ProduceData", "Getting RSS XML from {0} ...", url);
                string mimeType, charSet;
                Encoding codePage = null;
                byte[] xmlBytes = WebUtils.GetWebResource(url, out mimeType, out charSet);
                if (charSet == null) // charSet info not given
                {
                    if (mRssXmlCodePageDetectorLanguage != null)
                    {
                        // decode as ISO-8859-1 first, just to have some text to run the detector on
                        string latin1Xml = FixXml(Encoding.GetEncoding("ISO-8859-1").GetString(xmlBytes));
                        // extract texts
                        string content = ExtractRssXmlContent(latin1Xml);
                        // try to guess code page
                        ArrayList<KeyDat<double, LanguageProfile>> ldResult = mCodePageDetector.DetectLanguageAll(content);
                        try
                        {
                            LanguageProfile bestLanguageProfile = ldResult
                                .Where(x => x.Second.Language == mRssXmlCodePageDetectorLanguage)
                                .OrderBy(x => x.First)
                                .First()
                                .Second;
                            codePage = bestLanguageProfile.CodePage;
                        }
                        catch
                        {
                            // best effort: if no profile can be picked, codePage stays null and the
                            // configured default encoding is used below
                        }
                    }
                }
                else
                {
                    codePage = Encoding.GetEncoding(charSet);
                }
                if (codePage == null)
                {
                    codePage = Encoding.GetEncoding(Config.rssReaderDefaultRssXmlEncoding);
                }
                xml = FixXml(codePage.GetString(xmlBytes));
            }
            catch (Exception e)
            {
                // NOTE(review): this ends the whole call, so the remaining sources are not polled
                // in this pass - kept as in the original; confirm that this is intended
                mLogger.Error("ProduceData", e);
                return null;
            }
            DocumentCorpus corpus = new DocumentCorpus();
            corpus.Features.SetFeatureValue("guid", Guid.NewGuid().ToString());
            // 'using' guarantees that the reader is closed on every exit path, including the early
            // return below and exceptions thrown while reading the feed
            using (XmlTextReader reader = new XmlTextReader(new StringReader(xml)))
            {
                // first pass: channel attributes
                ReadChannelAttributes(url, xml, timeStart, channelAttr);
                // second pass: items
                mLogger.Info("ProduceData", "Reading items ...");
                while (reader.Read())
                {
                    if (reader.NodeType == XmlNodeType.Element && reader.Name == "item" && !reader.IsEmptyElement)
                    {
                        Dictionary<string, string> itemAttr = new Dictionary<string, string>();
                        while (reader.Read() && !(reader.NodeType == XmlNodeType.EndElement && reader.Name == "item"))
                        {
                            if (reader.NodeType == XmlNodeType.Element)
                            {
                                // handle item attributes
                                if (mItemElements.Contains(reader.Name))
                                {
                                    string attrName = reader.Name;
                                    // XML attributes have to be fetched before the element value is read
                                    string emmTrigger = attrName == "category" ? reader.GetAttribute("emm:trigger") : null;
                                    string emmEntityId = attrName == "emm:entity" ? reader.GetAttribute("id") : null;
                                    string emmEntityName = attrName == "emm:entity" ? reader.GetAttribute("name") : null;
                                    string value = Utils.XmlReadValue(reader, attrName);
                                    if (value.Trim() != "")
                                    {
                                        string oldValue;
                                        if (attrName == "pubDate")
                                        {
                                            string tmp = Utils.NormalizeDateTimeStr(value);
                                            if (tmp != null) { value = tmp; }
                                        }
                                        if (emmTrigger != null)
                                        {
                                            value += " ; " + emmTrigger.Replace(';', ',').TrimEnd(' ', ',');
                                        }
                                        if (emmEntityId != null && emmEntityName != null)
                                        {
                                            value = string.Format("{0} ; {1} ; {2}", emmEntityName, value, emmEntityId);
                                        }
                                        // repeated elements are joined into one value with " ;; "
                                        if (itemAttr.TryGetValue(attrName, out oldValue))
                                        {
                                            itemAttr[attrName] = oldValue + " ;; " + value;
                                        }
                                        else
                                        {
                                            itemAttr.Add(attrName, value);
                                        }
                                    }
                                }
                                else
                                {
                                    Utils.XmlSkip(reader, reader.Name);
                                }
                            }
                        }
                        // stopped?
                        if (mStopped)
                        {
                            if (corpus.Documents.Count == 0) { return null; }
                            break; // dispatch what has been collected so far
                        }
                        ProcessItem(itemAttr, corpus, url, xml);
                        if (mMaxDocsPerCorpus > 0 && corpus.Documents.Count == mMaxDocsPerCorpus)
                        {
                            // corpus is full: dispatch it and start a new one
                            numNewItems += corpus.Documents.Count;
                            foreach (KeyValuePair<string, string> attr in channelAttr)
                            {
                                corpus.Features.SetFeatureValue(attr.Key, attr.Value);
                            }
                            corpus.Features.SetFeatureValue("timeEnd", DateTime.Now.ToString(Utils.DATE_TIME_SIMPLE));
                            DispatchData(corpus);
                            corpus = new DocumentCorpus();
                            corpus.Features.SetFeatureValue("guid", Guid.NewGuid().ToString());
                        }
                    }
                }
            }
            if (corpus.Documents.Count > 0)
            {
                numNewItems += corpus.Documents.Count;
                foreach (KeyValuePair<string, string> attr in channelAttr)
                {
                    corpus.Features.SetFeatureValue(attr.Key, attr.Value);
                }
                corpus.Features.SetFeatureValue("timeEnd", DateTime.Now.ToString(Utils.DATE_TIME_SIMPLE));
                DispatchData(corpus);
            }
            mLogger.Info("ProduceData", "{0} new items.", numNewItems);
            // stopped?
            if (mStopped) { return null; }
        }
        catch (Exception e)
        {
            mLogger.Error("ProduceData", e);
        }
    }
    return null;
}
// Entry point. Builds the processing pipeline and starts the ZeroMQ receiver:
//   receiver -> oldBranch -> NUM_PIPES x (rcv -> categorizer -> filter -> entity recognition -> mkId -> snd) -> emitter
//   receiver -> bypass    -> NUM_PIPES_BYPASS x (entity recognition -> ontology categorizer -> pump index -> occurrence writer + document writer)
static void Main(string[] args)
{
    Logger logger = Logger.GetRootLogger();
    ZeroMqReceiverComponent zmqRcv = new ZeroMqReceiverComponent(delegate(string key)
    {
        // ignore these settings
        bool ignored = key == "MessageSendAddress" || key == "ReceiveLoadBalancingAdress" || key == "FinishPublish";
        return ignored ? null : ConfigurationManager.AppSettings.Get(key);
    });
    ZeroMqEmitterComponent zmqEmt = new ZeroMqEmitterComponent(delegate(string key)
    {
        // ignore these settings
        bool ignored = key == "MessageReceiveAddress" || key == "SendLoadBalancingAddress" || key == "FinishReceive";
        return ignored ? null : ConfigurationManager.AppSettings.Get(key);
    });
    // first branch (goes to WP4)
    PassOnComponent oldBranch = new PassOnComponent();
    oldBranch.DispatchPolicy = DispatchPolicy.BalanceLoadMax;
    // second branch ("bypass", writes to DB)
    PassOnComponent bypass = new PassOnComponent();
    bypass.DispatchPolicy = DispatchPolicy.BalanceLoadMax;
    zmqRcv.Subscribe(oldBranch);
    zmqRcv.Subscribe(bypass);
    for (int pipeIdx = 0; pipeIdx < NUM_PIPES; pipeIdx++)
    {
        // create components
        DocumentFilterComponent rcv = new DocumentFilterComponent();
        rcv.OnFilterDocument += new DocumentFilterComponent.FilterDocumentHandler((doc, log) =>
        {
            // pass-through filter, only traces incoming documents
            Console.WriteLine("RCV " + doc.Name);
            return true;
        });
        DocumentCategorizerComponent categorizer = new DocumentCategorizerComponent();
        categorizer.BlockSelector = "TextBlock/Content";
        DocumentFilterComponent docFilter = new DocumentFilterComponent();
        docFilter.OnFilterDocument += new DocumentFilterComponent.FilterDocumentHandler(Filter);
        EntityRecognitionComponent entityRecognizer = new EntityRecognitionComponent(ONTOLOGY_FOLDER);
        entityRecognizer.BlockSelector = "TextBlock/Content";
        DocumentFilterComponent snd = new DocumentFilterComponent();
        snd.OnFilterDocument += new DocumentFilterComponent.FilterDocumentHandler((doc, log) =>
        {
            // pass-through filter, only traces outgoing documents
            Console.WriteLine("SND " + doc.Name + " [" + doc.Features.GetFeatureValue("fullId") + "]");
            return true;
        });
        GenericStreamDataProcessor mkId = new GenericStreamDataProcessor();
        mkId.OnProcessData += new GenericStreamDataProcessor.ProcessDataHandler((sender, data) =>
        {
            // assign "fullId" = <corpus timeEnd as HH_mm_ss>_<corpus guid>_<document guid> to every document
            DocumentCorpus incoming = (DocumentCorpus)data;
            string corpusId = incoming.Features.GetFeatureValue("guid").Replace("-", "");
            DateTime timeEnd = DateTime.Parse(incoming.Features.GetFeatureValue("timeEnd"));
            foreach (Document document in incoming.Documents)
            {
                string docId = document.Features.GetFeatureValue("guid").Replace("-", "");
                document.Features.SetFeatureValue("fullId", timeEnd.ToString("HH_mm_ss_") + corpusId + "_" + docId);
            }
            return data;
        });
        // build branch
        oldBranch.Subscribe(rcv);
        rcv.Subscribe(categorizer);
        categorizer.Subscribe(docFilter);
        docFilter.Subscribe(entityRecognizer);
        entityRecognizer.Subscribe(mkId);
        mkId.Subscribe(snd);
        snd.Subscribe(zmqEmt);
    }
    // Petra's code
    Ontology bypassOntology = new Ontology(ONTOLOGY_FOLDER_BYPASS);
    bypassOntology.ToDb(CONNECTION_STRING_OCCURRENCE); // fill DB tables entity and class
    // end of Petra's code
    for (int pipeIdx = 0; pipeIdx < NUM_PIPES_BYPASS; pipeIdx++)
    {
        // create components
        EntityRecognitionComponent entityRecognizer = new EntityRecognitionComponent(ONTOLOGY_FOLDER_BYPASS);
        entityRecognizer.BlockSelector = "TextBlock/Content";
        OntologyCategorizerComponent ontologyCategorizer = new OntologyCategorizerComponent();
        PumpIndexComponent pumpIndex = new PumpIndexComponent();
        OccurrenceWriterComponent occurrenceWriter = new OccurrenceWriterComponent(CONNECTION_STRING_OCCURRENCE);
        DocumentWriterComponent documentWriter = new DocumentWriterComponent(null, /*cmdTimeout=*/0, XML_DATA_ROOT, null);
        // build branch
        bypass.Subscribe(entityRecognizer);
        entityRecognizer.Subscribe(ontologyCategorizer);
        ontologyCategorizer.Subscribe(pumpIndex);
        pumpIndex.Subscribe(occurrenceWriter);
        pumpIndex.Subscribe(documentWriter);
    }
    zmqRcv.Start();
    logger.Info("Main", "The pipeline is running.");
}
// Posts a summary of every document in the corpus (URL, title, source, snippet, timestamp) to the
// feed form at http://first-vm4.ijs.si/feed-form/ as one form-encoded POST request.
//
// corpus - corpus whose documents are reported
//
// Returns: true if the response was obtained, false if getting the response threw an exception.
// Exceptions thrown while opening or writing the request stream are NOT caught here and reach the caller.
private bool SendDocumentCorpusInfo(DocumentCorpus corpus)
{
    // taken from Latino.Web WebUtils.cs
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create("http://first-vm4.ijs.si/feed-form/");
    request.UserAgent = "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.8.0.6) Gecko/20060728 Firefox/1.5.0.6";
    request.Accept = "text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,*/*;q=0.5";
    request.Headers.Add("Accept-Language", "en-us,en;q=0.5");
    request.Headers.Add("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.7");
    // configure POST request
    request.CookieContainer = mCookies;
    request.Method = "POST";
    StringBuilder postData = new StringBuilder(string.Format("csrfmiddlewaretoken={0}&form-TOTAL_FORMS={1}&form-INITIAL_FORMS=0", mCsrftoken, corpus.Documents.Count));
    int formIdx = 0;
    foreach (Document document in corpus.Documents)
    {
        string title = Utils.ToOneLine(document.Name, /*compact=*/true);
        // build the snippet from content blocks for which IsSubstring(block, title) is below 0.2,
        // stopping once more than 600 characters have been collected
        TextBlock[] textBlocks = document.GetAnnotatedBlocks("TextBlock/Content");
        StringBuilder text = new StringBuilder();
        foreach (TextBlock textBlock in textBlocks)
        {
            if (IsSubstring(textBlock.Text, title) < 0.2)
            {
                text.AppendLine(textBlock.Text);
                if (text.Length > 600) { break; }
            }
        }
        // NOTE(review): the timestamp below is a hard-coded constant, so every document is reported
        // with the same time - looks like a placeholder; confirm the format the server expects
        // before replacing it with the document's own time
        //&form-0-url=...&form-0-title=...&form-0-source=...&form-0-snippet=...&form-0-timestamp=...
        postData.AppendFormat("&form-{5}-url={0}&form-{5}-title={1}&form-{5}-source={2}&form-{5}-snippet={3}&form-{5}-timestamp={4}",
            HttpUtility.UrlEncode(document.Features.GetFeatureValue("responseUrl")),
            HttpUtility.UrlEncode(HttpUtility.HtmlEncode(title)),
            HttpUtility.UrlEncode(corpus.Features.GetFeatureValue("siteId")),
            HttpUtility.UrlEncode(HttpUtility.HtmlEncode(Utils.ToOneLine(Utils.Truncate(text.ToString(), 500), /*compact=*/true)) + " ..."),
            "2012-04-13+16%3A47%3A38",
            formIdx);
        formIdx++;
    }
    byte[] buffer = Encoding.ASCII.GetBytes(postData.ToString());
    request.ContentLength = buffer.Length;
    request.ContentType = "application/x-www-form-urlencoded";
    // 'using' releases the request stream even if Write throws
    using (Stream dataStream = request.GetRequestStream())
    {
        dataStream.Write(buffer, 0, buffer.Length);
    }
    // send request
    try
    {
        request.GetResponse().Close();
        return true;
    }
    catch
    {
        // any failure while getting the response is reported to the caller as 'false'
        return false;
    }
}
// Persists a received document corpus. Depending on the configuration it
//   - writes the corpus XML to <mXmlDataRoot>\<year>\<month>\<day>\<recordId>.xml (if mXmlDataRoot is set),
//   - creates an HTML page under <mHtmlDataRoot>\<year>\<month>\<day>\<recordId>\ (if mHtmlDataRoot is set),
//   - inserts one row into Corpora and one row per document into Documents (if mWriteToDatabase is set),
// where recordId is the corpus "timeEnd" formatted as HH_mm_ss_ followed by the corpus GUID without dashes.
//
// sender - not used by this method
// data   - must be a DocumentCorpus; otherwise an ArgumentTypeException is passed to Utils.ThrowException
//
// Failed database inserts are logged as warnings; they do not stop the processing.
protected override void ConsumeData(IDataProducer sender, object data)
{
    Utils.ThrowException(!(data is DocumentCorpus) ? new ArgumentTypeException("data") : null);
    DocumentCorpus corpus = (DocumentCorpus)data;
    string corpusId = corpus.Features.GetFeatureValue("guid").Replace("-", "");
    // serialize the corpus into memory
    XmlWriterSettings xmlSettings = new XmlWriterSettings();
    xmlSettings.Indent = true;
    xmlSettings.NewLineOnAttributes = true;
    xmlSettings.CheckCharacters = false;
    StringWriter stringWriter = new StringWriter();
    using (XmlWriter writer = XmlWriter.Create(stringWriter, xmlSettings))
    {
        corpus.WriteXml(writer, /*writeTopElement=*/true);
    }
    DateTime timeEnd = DateTime.Parse(corpus.Features.GetFeatureValue("timeEnd"));
    string recordId = timeEnd.ToString("HH_mm_ss_") + corpusId;
    // write to file
    if (mXmlDataRoot != null)
    {
        string path = string.Format(@"{3}\{0}\{1}\{2}\", timeEnd.Year, timeEnd.Month, timeEnd.Day, mXmlDataRoot.TrimEnd('\\'));
        if (!Directory.Exists(path))
        {
            lock (mLock)
            {
                // re-check inside the lock before creating the directory
                if (!Directory.Exists(path)) { Directory.CreateDirectory(path); }
            }
        }
        // 'using' releases the file handle even if Write throws
        using (StreamWriter w = new StreamWriter(path + recordId + ".xml", /*append=*/false, Encoding.UTF8))
        {
            // the XML was produced through a StringWriter, so its declaration says utf-16;
            // the file is written as UTF-8, hence the declaration is patched
            w.Write(stringWriter.ToString().Replace("<?xml version=\"1.0\" encoding=\"utf-16\"?>", "<?xml version=\"1.0\" encoding=\"utf-8\"?>"));
        }
    }
    if (mHtmlDataRoot != null)
    {
        string pathHtml = string.Format(@"{4}\{0}\{1}\{2}\{3}\", timeEnd.Year, timeEnd.Month, timeEnd.Day, recordId, mHtmlDataRoot.TrimEnd('\\'));
        if (!Directory.Exists(pathHtml))
        {
            lock (mLock)
            {
                if (!Directory.Exists(pathHtml)) { Directory.CreateDirectory(pathHtml); }
            }
        }
        corpus.MakeHtmlPage(pathHtml, /*inlineCss=*/true);
    }
    // write to database
    if (mWriteToDatabase)
    {
        bool success = mConnection.ExecuteNonQuery("insert into Corpora (id, title, language, sourceUrl, timeStart, timeEnd, siteId, rejected) values (?, ?, ?, ?, ?, ?, ?, ?)",
            corpusId,
            Utils.Truncate(corpus.Features.GetFeatureValue("title"), 400),
            Utils.Truncate(corpus.Features.GetFeatureValue("language"), 100),
            Utils.Truncate(corpus.Features.GetFeatureValue("sourceUrl"), 400),
            Utils.Truncate(corpus.Features.GetFeatureValue("timeStart"), 26),
            Utils.Truncate(corpus.Features.GetFeatureValue("timeEnd"), 26),
            Utils.Truncate(corpus.Features.GetFeatureValue("siteId"), 100),
            mIsDumpWriter
            );
        if (!success) { mLogger.Warn("ConsumeData", "Unable to write to database."); }
        foreach (Document document in corpus.Documents)
        {
            string documentId = new Guid(document.Features.GetFeatureValue("guid")).ToString("N");
            // the character counts may be missing; a missing count is passed to the query as null
            string bpCharCountStr = document.Features.GetFeatureValue("bprBoilerplateCharCount");
            string contentCharCountStr = document.Features.GetFeatureValue("bprContentCharCount");
            string unseenContentCharCountStr = document.Features.GetFeatureValue("unseenContentCharCount");
            string unseenContent = document.Features.GetFeatureValue("unseenContent");
            success = mConnection.ExecuteNonQuery("insert into Documents (id, corpusId, name, description, category, link, responseUrl, urlKey, time, pubDate, mimeType, contentType, charSet, contentLength, detectedLanguage, detectedCharRange, domain, bpCharCount, contentCharCount, rejected, unseenContent, unseenContentCharCount, rev) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
                documentId,
                corpusId,
                Utils.Truncate(document.Name, 400),
                Utils.Truncate(document.Features.GetFeatureValue("description"), 400),
                Utils.Truncate(document.Features.GetFeatureValue("category"), 400),
                Utils.Truncate(document.Features.GetFeatureValue("link"), 400),
                Utils.Truncate(document.Features.GetFeatureValue("responseUrl"), 400),
                Utils.Truncate(document.Features.GetFeatureValue("urlKey"), 400),
                Utils.Truncate(document.Features.GetFeatureValue("time"), 26),
                Utils.Truncate(document.Features.GetFeatureValue("pubDate"), 26),
                Utils.Truncate(document.Features.GetFeatureValue("mimeType"), 80),
                Utils.Truncate(document.Features.GetFeatureValue("contentType"), 40),
                Utils.Truncate(document.Features.GetFeatureValue("charSet"), 40),
                Convert.ToInt32(document.Features.GetFeatureValue("contentLength")),
                Utils.Truncate(document.Features.GetFeatureValue("detectedLanguage"), 100),
                Utils.Truncate(document.Features.GetFeatureValue("detectedCharRange"), 100),
                Utils.Truncate(document.Features.GetFeatureValue("domainName"), 100),
                bpCharCountStr == null ? null : (object)Convert.ToInt32(bpCharCountStr),
                contentCharCountStr == null ? null : (object)Convert.ToInt32(contentCharCountStr),
                mIsDumpWriter,
                Utils.Truncate(unseenContent, 20),
                unseenContentCharCountStr == null ? null : (object)Convert.ToInt32(unseenContentCharCountStr),
                Convert.ToInt32(document.Features.GetFeatureValue("rev"))
                );
            if (!success) { mLogger.Warn("ConsumeData", "Unable to write to database."); }
        }
    }
}