예제 #1
0
        /// <summary>
        /// Scratch entry point: builds a corpus of two small documents, writes it
        /// through <c>DocumentCorpus.WriteXml</c> into an in-memory writer and prints
        /// the captured text to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            Console.WriteLine("hello worlds!");
            // Earlier experiments, currently disabled:
            //X x = new X();
            //A a = new A();
            //B b = new B();
            //C c = new C();
            //D d = new D();
            //GenericStreamDataConsumer gsdc = new GenericStreamDataConsumer();
            //gsdc.OnConsumeData += delegate(IDataProducer sender, object data)
            //{
            //    Console.WriteLine((string)data);
            //};
            //Y y = new Y();

            //x.Subscribe(a);
            //a.Subscribe(b);
            //b.Subscribe(gsdc);

            //x.Subscribe(c);
            //c.Subscribe(d);
            //d.Subscribe(gsdc);

            //x.Start();
            //Console.ReadLine();
            //Console.WriteLine("stop");
            //x.GracefulStop();
            //Console.ReadLine();

            //DocumentCorpus corpus = new DocumentCorpus();
            //Document doc = new Document("This is a very short document. This is some boilerplate.");
            //corpus.Add(doc);
            //Annotation annot = new Annotation(0, 29, "content_block");
            ////doc.AddAnnotation(annot);
            //RegexTokenizerComponent tok = new RegexTokenizerComponent();
            //tok.ReceiveData(null, corpus);

            //Regex mCharsetRegex
            //    = new Regex(@"((charset)|(encoding))\s*=\s*(([""'](?<enc>[^""']+)[""'])|((?<enc>[^\s>""']+)))", RegexOptions.Compiled | RegexOptions.IgnoreCase);

            //Console.WriteLine(mCharsetRegex.Match(@"<?xml version=""1.0"" encoding=""ISO-8859-1""?>").Success);

            //RssFeedComponent rss = new RssFeedComponent(@"http://feeds.abcnews.com/abcnews/moneyheadlines");
            //rss.Start();

            Document doc  = new Document("name", "bla bla");
            Document doc2 = new Document("name2", "bla bla 2");

            doc.AddAnnotation(new Annotation(0, 100, "waka waka"));

            DocumentCorpus c = new DocumentCorpus();
            c.AddDocument(doc);
            c.AddDocument(doc2);

            // Both writers are IDisposable; the using blocks release them even if
            // WriteXml throws.
            using (StringWriter sw = new StringWriter())
            using (XmlTextWriter writer = new XmlTextWriter(sw))
            {
                c.WriteXml(writer);
                // Make sure everything the XML writer produced has reached sw
                // before its content is printed.
                writer.Flush();
                Console.WriteLine(sw);
            }
        }
 /// <summary>
 /// Builds a new <c>DocumentCorpus</c> from an array of text lines, one document per line.
 /// When <c>mIsNamedDoc</c> is set and the line contains a space, tab or newline, the part
 /// before the first such character becomes the document name and the remainder the text;
 /// otherwise the name is empty and the whole trimmed line is the text.
 /// </summary>
 /// <param name="sender">Producer that supplied the data (not used here).</param>
 /// <param name="data">Expected to be a <c>string[]</c>; for anything else an
 /// <c>ArgumentTypeException</c> is passed to <c>Utils.ThrowException</c>.</param>
 /// <returns>The new corpus, carrying the features <c>_provider</c>, <c>_isNamedDoc</c>,
 /// <c>_timeStart</c> and <c>_timeEnd</c>; each document carries a <c>_time</c> feature.</returns>
 protected override object ProcessData(IDataProducer sender, object data)
 {
     bool isStringArray = data is string[];
     Utils.ThrowException(isStringArray ? null : new ArgumentTypeException("data"));
     DateTime startTime = DateTime.Now;
     DocumentCorpus result = new DocumentCorpus();
     char[] separators = { ' ', '\t', '\n' };
     string[] lines = (string[])data;
     foreach (string text in lines)
     {
         int cut = text.IndexOfAny(separators);
         string docName;
         string docText;
         if (mIsNamedDoc && cut >= 0)
         {
             // "<name><separator><text>" layout
             docName = text.Substring(0, cut).Trim();
             docText = text.Substring(cut).Trim();
         }
         else
         {
             // unnamed document, or nothing to split on
             docName = "";
             docText = text.Trim();
         }
         Document item = new Document(docName, docText);
         item.Features.SetFeatureValue("_time", DateTime.Now.ToString(Utils.DATE_TIME_SIMPLE));
         result.AddDocument(item);
     }
     // corpus-level bookkeeping
     result.Features.SetFeatureValue("_provider", GetType().ToString());
     result.Features.SetFeatureValue("_isNamedDoc", mIsNamedDoc.ToString());
     result.Features.SetFeatureValue("_timeStart", startTime.ToString(Utils.DATE_TIME_SIMPLE));
     result.Features.SetFeatureValue("_timeEnd", DateTime.Now.ToString(Utils.DATE_TIME_SIMPLE));
     return result;
 }
예제 #3
0
 /// <summary>
 /// Passes every document of the incoming corpus to <c>OnFilterDocument</c>. Documents for
 /// which the handler returns false are removed from the corpus and collected in a separate
 /// "dump" corpus, which is dispatched to the dump consumers when it is not empty.
 /// </summary>
 /// <param name="sender">Producer that supplied the data (not used here).</param>
 /// <param name="data">Expected to be a <c>DocumentCorpus</c>; it is modified in place.</param>
 /// <returns>
 /// The incoming corpus if it still contains documents, null if it ended up empty, or the
 /// original <paramref name="data"/> reference if an exception was caught and logged.
 /// </returns>
 public /*protected*/ override object ProcessData(IDataProducer sender, object data)
 {
     try
     {
         DocumentCorpus corpus     = (DocumentCorpus)data;
         DocumentCorpus dumpCorpus = new DocumentCorpus();
         dumpCorpus.CopyFeaturesFrom(corpus);
         // Rejected documents are collected first and removed afterwards, so that
         // corpus.Documents is not modified while it is being enumerated.
         ArrayList <Document> dumpDocumentList = new ArrayList <Document>();
         foreach (Document document in corpus.Documents)
         {
             try
             {
                 if (OnFilterDocument != null)
                 {
                     if (!OnFilterDocument(document, mLogger))
                     {
                         dumpDocumentList.Add(document);
                     }
                 }
             }
             catch (Exception exception)
             {
                 // A failing handler keeps the document in the corpus; only log it.
                 mLogger.Error("ProcessDocument", exception);
             }
         }
         foreach (Document doc in dumpDocumentList)
         {
             corpus.Remove(doc);
             dumpCorpus.AddDocument(doc);
         }
         if (dumpCorpus.Documents.Count > 0)
         {
             WorkflowUtils.DispatchData(this, dumpCorpus, mCloneDumpOnFork, mDumpDispatchPolicy, mDumpDataConsumers, mLogger);
         }
         return(corpus.Documents.Count > 0 ? corpus : null);
     }
     catch (Exception exception)
     {
         mLogger.Error("ProcessData", exception);
         return(data);
     }
 }
예제 #4
0
        /// <summary>
        /// Scratch entry point: builds a corpus of two small documents, writes it
        /// through <c>DocumentCorpus.WriteXml</c> into an in-memory writer and prints
        /// the captured text to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            Console.WriteLine("hello worlds!");
            // Earlier experiments, currently disabled:
            //X x = new X();
            //A a = new A();
            //B b = new B();
            //C c = new C();
            //D d = new D();
            //GenericStreamDataConsumer gsdc = new GenericStreamDataConsumer();
            //gsdc.OnConsumeData += delegate(IDataProducer sender, object data)
            //{
            //    Console.WriteLine((string)data);
            //};
            //Y y = new Y();

            //x.Subscribe(a);
            //a.Subscribe(b);
            //b.Subscribe(gsdc);

            //x.Subscribe(c);
            //c.Subscribe(d);
            //d.Subscribe(gsdc);

            //x.Start();
            //Console.ReadLine();
            //Console.WriteLine("stop");
            //x.GracefulStop();
            //Console.ReadLine();

            //DocumentCorpus corpus = new DocumentCorpus();
            //Document doc = new Document("This is a very short document. This is some boilerplate.");
            //corpus.Add(doc);
            //Annotation annot = new Annotation(0, 29, "content_block");
            ////doc.AddAnnotation(annot);
            //RegexTokenizerComponent tok = new RegexTokenizerComponent();
            //tok.ReceiveData(null, corpus);

            //Regex mCharsetRegex
            //    = new Regex(@"((charset)|(encoding))\s*=\s*(([""'](?<enc>[^""']+)[""'])|((?<enc>[^\s>""']+)))", RegexOptions.Compiled | RegexOptions.IgnoreCase);

            //Console.WriteLine(mCharsetRegex.Match(@"<?xml version=""1.0"" encoding=""ISO-8859-1""?>").Success);

            //RssFeedComponent rss = new RssFeedComponent(@"http://feeds.abcnews.com/abcnews/moneyheadlines");
            //rss.Start();

            Document doc = new Document("name", "bla bla");
            Document doc2 = new Document("name2", "bla bla 2");
            doc.AddAnnotation(new Annotation(0, 100, "waka waka"));
            DocumentCorpus c = new DocumentCorpus();
            c.AddDocument(doc);
            c.AddDocument(doc2);
            // Both writers are IDisposable; the using blocks release them even if
            // WriteXml throws.
            using (StringWriter sw = new StringWriter())
            using (XmlTextWriter writer = new XmlTextWriter(sw))
            {
                c.WriteXml(writer);
                // Make sure everything the XML writer produced has reached sw
                // before its content is printed.
                writer.Flush();
                Console.WriteLine(sw);
            }
        }
 /// <summary>
 /// Handles one feed item: optionally records its source in the database, and, unless
 /// <c>mHistory</c> already knows the item's GUID, fetches the linked resource, builds a
 /// <c>Document</c> from it and adds that document to <paramref name="corpus"/>.
 /// All exceptions are caught and logged as warnings; nothing is thrown to the caller.
 /// </summary>
 /// <param name="itemAttr">Attributes of the item. Keys read here: title, description,
 /// pubDate, link, category, emm:entity. The dictionary is modified: fetch metadata
 /// (responseUrl, mimeType, contentType, charSet, contentLength, optionally raw) plus
 /// guid and time are written into it, and every entry becomes a document feature.</param>
 /// <param name="corpus">Corpus that receives the new document.</param>
 /// <param name="rssXmlUrl">Stored as <c>sourceUrl</c> in the Sources table (truncated to 400 characters).</param>
 /// <param name="xml">Hashed with <c>Utils.GetHashCode128</c> and stored in the RssXml table.</param>
 private void ProcessItem(Dictionary<string, string> itemAttr, DocumentCorpus corpus, string rssXmlUrl, string xml)
 {
     try
     {
         // TryGetValue sets its out argument to null when the key is missing, so the
         // empty-string default has to be applied after the lookup, not before it.
         string name;
         itemAttr.TryGetValue("title", out name);
         if (name == null) { name = ""; }
         string desc;
         itemAttr.TryGetValue("description", out desc);
         if (desc == null) { desc = ""; }
         string pubDate;
         itemAttr.TryGetValue("pubDate", out pubDate);
         if (pubDate == null) { pubDate = ""; }
         Guid guid = MakeGuid(name, desc, pubDate);
         mLogger.Info("ProcessItem", "Found item \"{0}\".", Utils.ToOneLine(name, /*compact=*/true));
         if (mDbConnectionString != null)
         {
             // Source logging happens for every item seen, before the history check.
             string xmlHash = Utils.GetHashCode128(xml).ToString("N");
             string category = null;
             itemAttr.TryGetValue("category", out category);
             string entities = null;
             itemAttr.TryGetValue("emm:entity", out entities);
             using (SqlConnection connection = new SqlConnection(mDbConnectionString))
             {
                 connection.Open();
                 using (SqlCommand cmd = new SqlCommand("insert into Sources (siteId, docId, sourceUrl, category, entities, xmlHash) values (@siteId, @docId, @sourceUrl, @category, @entities, @xmlHash)", connection))
                 {
                     WorkflowUtils.AssignParamsToCommand(cmd,
                         "siteId", Utils.Truncate(mSiteId, 100),
                         "docId", guid.ToString("N"),
                         "sourceUrl", Utils.Truncate(rssXmlUrl, 400),
                         "category", category,
                         "entities", entities,
                         "xmlHash", xmlHash);
                     cmd.ExecuteNonQuery();
                 }
                 using (SqlCommand cmd = new SqlCommand("insert into RssXml (hash, xml) values (@hash, @xml)", connection))
                 {
                     WorkflowUtils.AssignParamsToCommand(cmd,
                         "hash", xmlHash,
                         "xml", xml);
                     cmd.ExecuteNonQuery();
                 }
             }
         }
         if (!mHistory.CheckHistory(guid))
         {
             DateTime time = DateTime.Now;
             string content = "";
             string link;
             if (itemAttr.TryGetValue("link", out link) && link != null && link.Trim() != "")
             {
                 // get referenced Web page
                 mLogger.Info("ProcessItem", "Getting HTML from {0} ...", Utils.ToOneLine(link, /*compact=*/true));
                 string mimeType, charSet;
                 string responseUrl;
                 CookieContainer cookies = null;
                 byte[] bytes = WebUtils.GetWebResource(link, /*refUrl=*/null, ref cookies, WebUtils.DefaultTimeout, out mimeType, out charSet, mSizeLimit, out responseUrl);
                 if (bytes == null)
                 {
                     mLogger.Info("ProcessItem", "Item rejected because of its size.");
                     mHistory.AddToHistory(guid, mSiteId);
                     return;
                 }
                 ContentType contentType = GetContentType(mimeType);
                 if ((contentType & mContentFilter) == 0)
                 {
                     mLogger.Info("ProcessItem", "Item rejected because of its content type.");
                     mHistory.AddToHistory(guid, mSiteId);
                     return;
                 }
                 // Indexer assignment instead of Add: Add throws ArgumentException when the
                 // key is already present, which would land in the catch below and drop the
                 // whole item. NOTE(review): whether the caller can supply these keys
                 // depends on how itemAttr is populated - confirm against the caller.
                 itemAttr["responseUrl"] = responseUrl;
                 itemAttr["mimeType"] = mimeType;
                 itemAttr["contentType"] = contentType.ToString();
                 if (charSet == null) { charSet = "ISO-8859-1"; }
                 itemAttr["charSet"] = charSet;
                 itemAttr["contentLength"] = bytes.Length.ToString();
                 if (contentType == ContentType.Binary)
                 {
                     // save as base64-encoded binary data
                     content = Convert.ToBase64String(bytes);
                 }
                 else
                 {
                     // save as text
                     content = GetEncoding(charSet).GetString(bytes);
                     if (mIncludeRawData)
                     {
                         itemAttr["raw"] = Convert.ToBase64String(bytes);
                     }
                 }
                 // pause after each fetch
                 Thread.Sleep(mPolitenessSleep);
             }
             if (content == "")
             {
                 // nothing was downloaded: fall back to the item's own text
                 if (itemAttr.ContainsKey("description"))
                 {
                     content = itemAttr["description"];
                 }
                 else if (itemAttr.ContainsKey("title"))
                 {
                     content = itemAttr["title"];
                 }
             }
             // Same reasoning as above: a feed-supplied "guid" or "time" attribute must
             // not abort the item.
             itemAttr["guid"] = guid.ToString();
             itemAttr["time"] = time.ToString(Utils.DATE_TIME_SIMPLE);
             Document document = new Document(name, content);
             foreach (KeyValuePair<string, string> attr in itemAttr)
             {
                 document.Features.SetFeatureValue(attr.Key, attr.Value);
             }
             corpus.AddDocument(document);
             mHistory.AddToHistory(guid, mSiteId);
         }
     }
     catch (Exception e)
     {
         mLogger.Warn("ProcessItem", e);
     }
 }
예제 #6
0
 /// <summary>
 /// Handles one feed item: unless <c>mHistory</c> already knows the item's GUID, fetches the
 /// linked resource, builds a <c>Document</c> from it and adds that document to
 /// <paramref name="corpus"/>. All exceptions are caught and logged as warnings; nothing is
 /// thrown to the caller.
 /// </summary>
 /// <param name="itemAttr">Attributes of the item. Keys read here: title, description,
 /// pubDate, link. The dictionary is modified: fetch metadata (responseUrl, mimeType,
 /// contentType, charSet, contentLength, optionally raw) plus guid and time are written
 /// into it, and every entry becomes a document feature.</param>
 /// <param name="corpus">Corpus that receives the new document.</param>
 /// <param name="rssXmlUrl">Not used by this method.</param>
 /// <param name="xml">Not used by this method.</param>
 private void ProcessItem(Dictionary <string, string> itemAttr, DocumentCorpus corpus, string rssXmlUrl, string xml)
 {
     try
     {
         // TryGetValue sets its out argument to null when the key is missing, so the
         // empty-string default has to be applied after the lookup, not before it.
         string name;
         itemAttr.TryGetValue("title", out name);
         if (name == null)
         {
             name = "";
         }
         string desc;
         itemAttr.TryGetValue("description", out desc);
         if (desc == null)
         {
             desc = "";
         }
         string pubDate;
         itemAttr.TryGetValue("pubDate", out pubDate);
         if (pubDate == null)
         {
             pubDate = "";
         }
         Guid guid = MakeGuid(name, desc, pubDate);
         mLogger.Info("ProcessItem", "Found item \"{0}\".", Utils.ToOneLine(name, /*compact=*/ true));
         if (!mHistory.CheckHistory(guid))
         {
             DateTime time    = DateTime.Now;
             string   content = "";
             string   link;
             if (itemAttr.TryGetValue("link", out link) && link != null && link.Trim() != "")
             {
                 // get referenced Web page
                 mLogger.Info("ProcessItem", "Getting HTML from {0} ...", Utils.ToOneLine(link, /*compact=*/ true));
                 string          mimeType, charSet;
                 string          responseUrl;
                 CookieContainer cookies = null;
                 byte[]          bytes   = WebUtils.GetWebResource(link, /*refUrl=*/ null, ref cookies, WebUtils.DefaultTimeout, out mimeType, out charSet, mSizeLimit, out responseUrl);
                 if (bytes == null)
                 {
                     mLogger.Info("ProcessItem", "Item rejected because of its size.");
                     mHistory.AddToHistory(guid, mSiteId);
                     return;
                 }
                 ContentType contentType = GetContentType(mimeType);
                 if ((contentType & mContentFilter) == 0)
                 {
                     mLogger.Info("ProcessItem", "Item rejected because of its content type.");
                     mHistory.AddToHistory(guid, mSiteId);
                     return;
                 }
                 // Indexer assignment instead of Add: Add throws ArgumentException when the
                 // key is already present, which would land in the catch below and drop the
                 // whole item. NOTE(review): whether the caller can supply these keys
                 // depends on how itemAttr is populated - confirm against the caller.
                 itemAttr["responseUrl"] = responseUrl;
                 itemAttr["mimeType"]    = mimeType;
                 itemAttr["contentType"] = contentType.ToString();
                 if (charSet == null)
                 {
                     charSet = Config.rssReaderDefaultHtmlEncoding;
                 }
                 itemAttr["charSet"]       = charSet;
                 itemAttr["contentLength"] = bytes.Length.ToString();
                 if (contentType == ContentType.Binary)
                 {
                     // save as base64-encoded binary data
                     content = Convert.ToBase64String(bytes);
                 }
                 else
                 {
                     // save as text
                     content = GetEncoding(charSet).GetString(bytes);
                     if (mIncludeRawData)
                     {
                         itemAttr["raw"] = Convert.ToBase64String(bytes);
                     }
                 }
                 // pause after each fetch
                 Thread.Sleep(mPolitenessSleep);
             }
             if (content == "")
             {
                 // nothing was downloaded: fall back to the item's own text
                 if (itemAttr.ContainsKey("description"))
                 {
                     content = itemAttr["description"];
                 }
                 else if (itemAttr.ContainsKey("title"))
                 {
                     content = itemAttr["title"];
                 }
             }
             // Same reasoning as above: a feed-supplied "guid" or "time" attribute must
             // not abort the item.
             itemAttr["guid"] = guid.ToString();
             itemAttr["time"] = time.ToString(Utils.DATE_TIME_SIMPLE);
             Document document = new Document(name, content);
             foreach (KeyValuePair <string, string> attr in itemAttr)
             {
                 document.Features.SetFeatureValue(attr.Key, attr.Value);
             }
             corpus.AddDocument(document);
             mHistory.AddToHistory(guid, mSiteId);
         }
     }
     catch (Exception e)
     {
         mLogger.Warn("ProcessItem", e);
     }
 }