// Entry point: demo that builds a tiny two-document corpus, annotates one
// document, serializes the corpus to XML, and prints the XML to the console.
// (Dead commented-out experiments removed; they live in version control.)
static void Main(string[] args)
{
    Console.WriteLine("hello worlds!");
    Document doc = new Document("name", "bla bla");
    Document doc2 = new Document("name2", "bla bla 2");
    doc.AddAnnotation(new Annotation(0, 100, "waka waka"));
    DocumentCorpus c = new DocumentCorpus();
    c.AddDocument(doc);
    c.AddDocument(doc2);
    StringWriter sw = new StringWriter();
    using (XmlTextWriter writer = new XmlTextWriter(sw))
    {
        c.WriteXml(writer);
        // Flush before reading the StringWriter; XmlTextWriter buffers its
        // output and the original code could print an incomplete document.
        writer.Flush();
    }
    Console.WriteLine(sw);
}
// Converts an incoming string[] of text lines into a DocumentCorpus.
// When mIsNamedDoc is set and a line contains whitespace, the text before the
// first space/tab/newline becomes the document name and the remainder the
// document text; otherwise the whole trimmed line becomes an unnamed document.
// Provider and timing metadata are recorded as corpus features.
// Throws ArgumentTypeException (via Utils.ThrowException) when data is not string[].
protected override object ProcessData(IDataProducer sender, object data)
{
    Utils.ThrowException(!(data is string[]) ? new ArgumentTypeException("data") : null);
    DateTime startTime = DateTime.Now;
    DocumentCorpus result = new DocumentCorpus();
    char[] separators = new char[] { ' ', '\t', '\n' };
    foreach (string rawLine in (string[])data)
    {
        int sepPos = rawLine.IndexOfAny(separators);
        Document newDoc;
        if (mIsNamedDoc && sepPos >= 0)
        {
            // Name = text before the first separator; body = the rest.
            string docName = rawLine.Substring(0, sepPos).Trim();
            string docText = rawLine.Substring(sepPos).Trim();
            newDoc = new Document(docName, docText);
        }
        else
        {
            newDoc = new Document("", rawLine.Trim());
        }
        newDoc.Features.SetFeatureValue("_time", DateTime.Now.ToString(Utils.DATE_TIME_SIMPLE));
        result.AddDocument(newDoc);
    }
    result.Features.SetFeatureValue("_provider", GetType().ToString());
    result.Features.SetFeatureValue("_isNamedDoc", mIsNamedDoc.ToString());
    result.Features.SetFeatureValue("_timeStart", startTime.ToString(Utils.DATE_TIME_SIMPLE));
    result.Features.SetFeatureValue("_timeEnd", DateTime.Now.ToString(Utils.DATE_TIME_SIMPLE));
    return result;
}
// Filters the incoming DocumentCorpus through the OnFilterDocument callback.
// Rejected documents are moved into a dump corpus (inheriting the input's
// features) and dispatched to the dump consumers; the surviving corpus is
// returned, or null when every document was dumped. Any top-level failure is
// logged and the data is passed through unchanged.
// Fix: removed the dead 'filteredCorpus' local that was created and populated
// with features but never used.
public /*protected*/ override object ProcessData(IDataProducer sender, object data)
{
    try
    {
        DocumentCorpus corpus = (DocumentCorpus)data;
        DocumentCorpus dumpCorpus = new DocumentCorpus();
        dumpCorpus.CopyFeaturesFrom(corpus);
        ArrayList<Document> dumpDocumentList = new ArrayList<Document>();
        foreach (Document document in corpus.Documents)
        {
            try
            {
                // A null OnFilterDocument means "keep everything".
                if (OnFilterDocument != null && !OnFilterDocument(document, mLogger))
                {
                    dumpDocumentList.Add(document);
                }
            }
            catch (Exception exception)
            {
                // A failing filter on one document must not abort the batch.
                mLogger.Error("ProcessDocument", exception);
            }
        }
        // Remove after enumeration completes; removing inside the foreach
        // would modify the collection while iterating it.
        foreach (Document doc in dumpDocumentList)
        {
            corpus.Remove(doc);
            dumpCorpus.AddDocument(doc);
        }
        if (dumpCorpus.Documents.Count > 0)
        {
            WorkflowUtils.DispatchData(this, dumpCorpus, mCloneDumpOnFork, mDumpDispatchPolicy, mDumpDataConsumers, mLogger);
        }
        return corpus.Documents.Count > 0 ? corpus : null;
    }
    catch (Exception exception)
    {
        mLogger.Error("ProcessData", exception);
        return data; // best-effort pass-through on failure
    }
}
// Processes a single RSS <item>: optionally logs it to the database, fetches
// the linked Web page (subject to size and content-type filters), and --
// unless the item GUID was already seen -- adds it to the given corpus with
// every item attribute attached as a document feature. All failures are
// logged and swallowed so one bad item cannot break the feed.
// Fix: Dictionary.TryGetValue sets the out argument to null on a miss, so the
// original "= \"\"" initializers were dead and name/desc/pubDate could be null;
// missing attributes are now normalized back to empty strings.
private void ProcessItem(Dictionary<string, string> itemAttr, DocumentCorpus corpus, string rssXmlUrl, string xml)
{
    try
    {
        string name;
        if (!itemAttr.TryGetValue("title", out name)) { name = ""; }
        string desc;
        if (!itemAttr.TryGetValue("description", out desc)) { desc = ""; }
        string pubDate;
        if (!itemAttr.TryGetValue("pubDate", out pubDate)) { pubDate = ""; }
        Guid guid = MakeGuid(name, desc, pubDate);
        mLogger.Info("ProcessItem", "Found item \"{0}\".", Utils.ToOneLine(name, /*compact=*/true));
        if (mDbConnectionString != null)
        {
            // Record the raw item and its source in the database.
            string xmlHash = Utils.GetHashCode128(xml).ToString("N");
            string category = null;
            itemAttr.TryGetValue("category", out category); // null is acceptable here
            string entities = null;
            itemAttr.TryGetValue("emm:entity", out entities); // null is acceptable here
            using (SqlConnection connection = new SqlConnection(mDbConnectionString))
            {
                connection.Open();
                // Parameterized commands: values are never concatenated into SQL.
                using (SqlCommand cmd = new SqlCommand("insert into Sources (siteId, docId, sourceUrl, category, entities, xmlHash) values (@siteId, @docId, @sourceUrl, @category, @entities, @xmlHash)", connection))
                {
                    WorkflowUtils.AssignParamsToCommand(cmd,
                        "siteId", Utils.Truncate(mSiteId, 100),
                        "docId", guid.ToString("N"),
                        "sourceUrl", Utils.Truncate(rssXmlUrl, 400),
                        "category", category,
                        "entities", entities,
                        "xmlHash", xmlHash);
                    cmd.ExecuteNonQuery();
                }
                using (SqlCommand cmd = new SqlCommand("insert into RssXml (hash, xml) values (@hash, @xml)", connection))
                {
                    WorkflowUtils.AssignParamsToCommand(cmd, "hash", xmlHash, "xml", xml);
                    cmd.ExecuteNonQuery();
                }
            }
        }
        if (!mHistory.CheckHistory(guid))
        {
            DateTime time = DateTime.Now;
            string content = "";
            if (itemAttr.ContainsKey("link") && itemAttr["link"].Trim() != "")
            {
                // get referenced Web page
                mLogger.Info("ProcessItem", "Getting HTML from {0} ...", Utils.ToOneLine(itemAttr["link"], /*compact=*/true));
                string mimeType, charSet;
                string responseUrl;
                CookieContainer cookies = null;
                byte[] bytes = WebUtils.GetWebResource(itemAttr["link"], /*refUrl=*/null, ref cookies, WebUtils.DefaultTimeout, out mimeType, out charSet, mSizeLimit, out responseUrl);
                if (bytes == null)
                {
                    // GetWebResource returns null when the size limit is exceeded.
                    mLogger.Info("ProcessItem", "Item rejected because of its size.");
                    mHistory.AddToHistory(guid, mSiteId);
                    return;
                }
                ContentType contentType = GetContentType(mimeType);
                if ((contentType & mContentFilter) == 0)
                {
                    mLogger.Info("ProcessItem", "Item rejected because of its content type.");
                    mHistory.AddToHistory(guid, mSiteId);
                    return;
                }
                itemAttr.Add("responseUrl", responseUrl);
                itemAttr.Add("mimeType", mimeType);
                itemAttr.Add("contentType", contentType.ToString());
                if (charSet == null) { charSet = "ISO-8859-1"; } // fallback when the server supplied no charset
                itemAttr.Add("charSet", charSet);
                itemAttr.Add("contentLength", bytes.Length.ToString());
                if (contentType == ContentType.Binary)
                {
                    // save as base64-encoded binary data
                    content = Convert.ToBase64String(bytes);
                }
                else
                {
                    // save as text
                    content = GetEncoding(charSet).GetString(bytes);
                    if (mIncludeRawData)
                    {
                        itemAttr.Add("raw", Convert.ToBase64String(bytes));
                    }
                }
                Thread.Sleep(mPolitenessSleep); // politeness delay between fetches
            }
            if (content == "")
            {
                // No page fetched: fall back to the RSS description or title.
                if (itemAttr.ContainsKey("description")) { content = itemAttr["description"]; }
                else if (itemAttr.ContainsKey("title")) { content = itemAttr["title"]; }
            }
            itemAttr.Add("guid", guid.ToString());
            itemAttr.Add("time", time.ToString(Utils.DATE_TIME_SIMPLE));
            Document document = new Document(name, content);
            foreach (KeyValuePair<string, string> attr in itemAttr)
            {
                document.Features.SetFeatureValue(attr.Key, attr.Value);
            }
            corpus.AddDocument(document);
            mHistory.AddToHistory(guid, mSiteId);
        }
    }
    catch (Exception e)
    {
        mLogger.Warn("ProcessItem", e);
    }
}
// Processes a single RSS <item> (variant without database logging): fetches
// the linked Web page (subject to size and content-type filters) and --
// unless the item GUID was already seen -- adds it to the given corpus with
// every item attribute attached as a document feature. All failures are
// logged and swallowed so one bad item cannot break the feed.
// Fix: Dictionary.TryGetValue sets the out argument to null on a miss, so the
// original "= \"\"" initializers were dead and name/desc/pubDate could be null;
// missing attributes are now normalized back to empty strings.
private void ProcessItem(Dictionary<string, string> itemAttr, DocumentCorpus corpus, string rssXmlUrl, string xml)
{
    try
    {
        string name;
        if (!itemAttr.TryGetValue("title", out name)) { name = ""; }
        string desc;
        if (!itemAttr.TryGetValue("description", out desc)) { desc = ""; }
        string pubDate;
        if (!itemAttr.TryGetValue("pubDate", out pubDate)) { pubDate = ""; }
        Guid guid = MakeGuid(name, desc, pubDate);
        mLogger.Info("ProcessItem", "Found item \"{0}\".", Utils.ToOneLine(name, /*compact=*/ true));
        if (!mHistory.CheckHistory(guid))
        {
            DateTime time = DateTime.Now;
            string content = "";
            if (itemAttr.ContainsKey("link") && itemAttr["link"].Trim() != "")
            {
                // get referenced Web page
                mLogger.Info("ProcessItem", "Getting HTML from {0} ...", Utils.ToOneLine(itemAttr["link"], /*compact=*/ true));
                string mimeType, charSet;
                string responseUrl;
                CookieContainer cookies = null;
                byte[] bytes = WebUtils.GetWebResource(itemAttr["link"], /*refUrl=*/ null, ref cookies, WebUtils.DefaultTimeout, out mimeType, out charSet, mSizeLimit, out responseUrl);
                if (bytes == null)
                {
                    // GetWebResource returns null when the size limit is exceeded.
                    mLogger.Info("ProcessItem", "Item rejected because of its size.");
                    mHistory.AddToHistory(guid, mSiteId);
                    return;
                }
                ContentType contentType = GetContentType(mimeType);
                if ((contentType & mContentFilter) == 0)
                {
                    mLogger.Info("ProcessItem", "Item rejected because of its content type.");
                    mHistory.AddToHistory(guid, mSiteId);
                    return;
                }
                itemAttr.Add("responseUrl", responseUrl);
                itemAttr.Add("mimeType", mimeType);
                itemAttr.Add("contentType", contentType.ToString());
                if (charSet == null) { charSet = Config.rssReaderDefaultHtmlEncoding; } // fallback when the server supplied no charset
                itemAttr.Add("charSet", charSet);
                itemAttr.Add("contentLength", bytes.Length.ToString());
                if (contentType == ContentType.Binary)
                {
                    // save as base64-encoded binary data
                    content = Convert.ToBase64String(bytes);
                }
                else
                {
                    // save as text
                    content = GetEncoding(charSet).GetString(bytes);
                    if (mIncludeRawData)
                    {
                        itemAttr.Add("raw", Convert.ToBase64String(bytes));
                    }
                }
                Thread.Sleep(mPolitenessSleep); // politeness delay between fetches
            }
            if (content == "")
            {
                // No page fetched: fall back to the RSS description or title.
                if (itemAttr.ContainsKey("description")) { content = itemAttr["description"]; }
                else if (itemAttr.ContainsKey("title")) { content = itemAttr["title"]; }
            }
            itemAttr.Add("guid", guid.ToString());
            itemAttr.Add("time", time.ToString(Utils.DATE_TIME_SIMPLE));
            Document document = new Document(name, content);
            foreach (KeyValuePair<string, string> attr in itemAttr)
            {
                document.Features.SetFeatureValue(attr.Key, attr.Value);
            }
            corpus.AddDocument(document);
            mHistory.AddToHistory(guid, mSiteId);
        }
    }
    catch (Exception e)
    {
        mLogger.Warn("ProcessItem", e);
    }
}