/// <summary>
/// Turns an array of text lines into a <c>DocumentCorpus</c>, creating one document per line.
/// When <c>mIsNamedDoc</c> is set and the line contains a space, tab or newline, the part before
/// the first such character becomes the document name and the rest becomes the text;
/// otherwise the document name is empty and the whole (trimmed) line is the text.
/// </summary>
/// <param name="sender">The producer that delivered the data (not used here).</param>
/// <param name="data">Must be a <c>string[]</c>; anything else is reported via <c>Utils.ThrowException</c>.</param>
/// <returns>The newly built corpus, stamped with provider and timing features.</returns>
protected override object ProcessData(IDataProducer sender, object data)
 {
     Utils.ThrowException(!(data is string[]) ? new ArgumentTypeException("data") : null);
     DateTime started = DateTime.Now;
     DocumentCorpus result = new DocumentCorpus();
     char[] separators = new char[] { ' ', '\t', '\n' };
     foreach (string line in (string[])data)
     {
         int cut = line.IndexOfAny(separators);
         // a name is only split off when naming is enabled and a separator exists
         bool hasName = mIsNamedDoc && cut >= 0;
         string docName = hasName ? line.Substring(0, cut).Trim() : "";
         string docText = hasName ? line.Substring(cut).Trim() : line.Trim();
         Document doc = new Document(docName, docText);
         doc.Features.SetFeatureValue("_time", DateTime.Now.ToString(Utils.DATE_TIME_SIMPLE));
         result.AddDocument(doc);
     }
     // corpus-level bookkeeping features
     result.Features.SetFeatureValue("_provider", GetType().ToString());
     result.Features.SetFeatureValue("_isNamedDoc", mIsNamedDoc.ToString());
     result.Features.SetFeatureValue("_timeStart", started.ToString(Utils.DATE_TIME_SIMPLE));
     result.Features.SetFeatureValue("_timeEnd", DateTime.Now.ToString(Utils.DATE_TIME_SIMPLE));
     return result;
 }
예제 #2
0
        /// <summary>
        /// Loads a FIRST XML corpus from <paramref name="file"/> and serializes every document
        /// in it as a GATE XML string (top element included, boilerplate removed).
        /// </summary>
        /// <param name="file">Path of the FIRST XML corpus file to read.</param>
        /// <returns>One GATE XML string per document, in corpus order.</returns>
        public static List<String> convertFIRSTXMLtoGATE(String file)
        {
            List<String> documents = new List<string>();

            // load FIRST XML corpus; the readers are disposed even if parsing fails
            DocumentCorpus corpus = new DocumentCorpus();
            using (StreamReader fileReader = new StreamReader(file))
            using (XmlTextReader xmlReader = new XmlTextReader(fileReader))
            {
                corpus.ReadXml(xmlReader);
            }

            // save documents as GATE XML
            XmlWriterSettings xmlSettings = new XmlWriterSettings();
            xmlSettings.Indent = true;
            xmlSettings.NewLineOnAttributes = true;
            xmlSettings.CheckCharacters = false;
            xmlSettings.Encoding = Encoding.UTF8;

            foreach (Document doc in corpus.Documents)
            {
                StringBuilder docXML = new StringBuilder();
                using (XmlWriter writer = XmlWriter.Create(docXML, xmlSettings))
                {
                    doc.WriteGateXml(writer, /*writeTopElement=*/true, /*removeBoilerplate=*/true);
                }
                // The writer buffers its output, so the builder is only read after the writer
                // has been closed; reading it earlier can yield truncated or empty XML.
                documents.Add(docXML.ToString());
            }

            return documents;
        }
예제 #3
0
        /// <summary>
        /// Converts a FIRST XML corpus file into one GATE XML file per document, then waits
        /// for a key press. Input and output locations are hard-coded.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // conversion from FIRST XML to GATE XML
            // load FIRST XML corpus; the readers are disposed even if parsing fails
            DocumentCorpus corpus = new DocumentCorpus();
            using (StreamReader fileReader = new StreamReader(@"D:\streamer\files\23_55_08_450f24c0969d49d2883fc17a6f4e2af0.xml"))
            using (XmlTextReader xmlReader = new XmlTextReader(fileReader))
            {
                corpus.ReadXml(xmlReader);
            }
            // save documents as GATE XML
            XmlWriterSettings xmlSettings = new XmlWriterSettings();
            xmlSettings.Indent = true;
            xmlSettings.NewLineOnAttributes = true;
            xmlSettings.CheckCharacters = false;
            xmlSettings.Encoding = Encoding.UTF8;
            int i = 0;
            foreach (Document doc in corpus.Documents)
            {
                // output files are numbered from 1; the XML writer is closed before the stream writer
                using (StreamWriter streamWriter = new StreamWriter(string.Format(@"D:\streamer\output\{0}.xml", ++i)))
                using (XmlWriter writer = XmlWriter.Create(streamWriter, xmlSettings))
                {
                    doc.WriteGateXml(writer, /*writeTopElement=*/true, /*removeBoilerplate=*/true);
                }
            }
            Console.ReadKey();
        }
 /// <summary>
 /// Starts the receiver thread unless the component is already running. The thread polls
 /// <c>mMessenger</c> for messages, parses each non-null message as a corpus XML and
 /// dispatches it, until <c>mStopped</c> is set or the messenger reports it is finished.
 /// </summary>
 public override void Start()
 {
     if (IsRunning)
     {
         return;
     }
     ThreadStart receiveLoop = delegate()
     {
         while (!mStopped && !mMessenger.isMessagingFinished())
         {
             string xml = mMessenger.getMessage();
             if (xml != null)
             {
                 try
                 {
                     DocumentCorpus received = new DocumentCorpus();
                     XmlReader xmlReader = new XmlTextReader(new StringReader(xml));
                     received.ReadXml(xmlReader);
                     xmlReader.Close();
                     DispatchData(received);
                 }
                 catch (Exception ex)
                 {
                     // a bad message is logged and skipped; the loop keeps running
                     mLogger.Error("ZeroMqReceiverComponent", ex);
                 }
             }
             Thread.Sleep(1);
         }
     };
     mThread = new Thread(receiveLoop);
     mStopped = false;
     mThread.Start();
 }
예제 #5
0
        /// <summary>
        /// Scratch entry point: prints a greeting, builds a corpus of two small documents
        /// (the first carrying one annotation), serializes the corpus to XML in memory and
        /// prints the result to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            Console.WriteLine("hello worlds!");

            // two sample documents; only the first one is annotated
            Document first = new Document("name", "bla bla");
            Document second = new Document("name2", "bla bla 2");
            first.AddAnnotation(new Annotation(0, 100, "waka waka"));

            // in-memory XML sink
            StringWriter output = new StringWriter();
            XmlTextWriter xmlOut = new XmlTextWriter(output);

            DocumentCorpus sample = new DocumentCorpus();
            sample.AddDocument(first);
            sample.AddDocument(second);
            sample.WriteXml(xmlOut);

            Console.WriteLine(output);
        }
예제 #6
0
        /// <summary>
        /// Posts a summary of every document in <paramref name="corpus"/> (URL, title, source,
        /// text snippet, timestamp) to the feed form as one URL-encoded form set.
        /// </summary>
        /// <param name="corpus">The corpus whose documents are reported.</param>
        /// <returns>
        /// <c>true</c> if a response was obtained; <c>false</c> if getting the response threw.
        /// Failures while building the request or writing its body are not caught here.
        /// </returns>
        private bool SendDocumentCorpusInfo(DocumentCorpus corpus)
        {
            // taken from Latino.Web WebUtils.cs
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create("http://first-vm4.ijs.si/feed-form/");

            request.UserAgent = "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.8.0.6) Gecko/20060728 Firefox/1.5.0.6";
            request.Accept    = "text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,*/*;q=0.5";
            request.Headers.Add("Accept-Language", "en-us,en;q=0.5");
            request.Headers.Add("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.7");
            // configure POST request
            request.CookieContainer = mCookies;
            request.Method          = "POST";
            StringBuilder postData = new StringBuilder(string.Format("csrfmiddlewaretoken={0}&form-TOTAL_FORMS={1}&form-INITIAL_FORMS=0", mCsrftoken, corpus.Documents.Count));
            int           i        = 0;

            foreach (Document document in corpus.Documents)
            {
                string        title      = Utils.ToOneLine(document.Name, /*compact=*/ true);
                TextBlock[]   textBlocks = document.GetAnnotatedBlocks("TextBlock/Content");
                StringBuilder text       = new StringBuilder();
                // build the snippet from content blocks, skipping blocks for which
                // IsSubstring(block, title) is 0.2 or more; stop once past 600 characters
                foreach (TextBlock textBlock in textBlocks)
                {
                    if (IsSubstring(textBlock.Text, title) < 0.2)
                    {
                        text.AppendLine(textBlock.Text);
                        if (text.Length > 600)
                        {
                            break;
                        }
                    }
                }
                //&form-0-url=...&form-0-title=...&form-0-source=...&form-0-snippet=...&form-0-timestamp=...
                // NOTE(review): the timestamp below is a fixed, pre-encoded literal rather than a
                // value taken from the document - confirm whether that is intended.
                string docData = string.Format("&form-{5}-url={0}&form-{5}-title={1}&form-{5}-source={2}&form-{5}-snippet={3}&form-{5}-timestamp={4}",
                                               HttpUtility.UrlEncode(document.Features.GetFeatureValue("responseUrl")),
                                               HttpUtility.UrlEncode(HttpUtility.HtmlEncode(title)),
                                               HttpUtility.UrlEncode(corpus.Features.GetFeatureValue("siteId")),
                                               HttpUtility.UrlEncode(HttpUtility.HtmlEncode(Utils.ToOneLine(Utils.Truncate(text.ToString(), 500), /*compact=*/ true)) + " ..."),
                                               "2012-04-13+16%3A47%3A38",
                                               i++);
                postData.Append(docData);
            }
            //Console.WriteLine(postData.ToString());
            byte[] buffer = Encoding.ASCII.GetBytes(postData.ToString());
            request.ContentLength = buffer.Length;
            request.ContentType   = "application/x-www-form-urlencoded";
            // dispose the request stream even if writing the body fails
            using (Stream dataStream = request.GetRequestStream())
            {
                dataStream.Write(buffer, 0, buffer.Length);
            }
            // send request
            try
            {
                request.GetResponse().Close();
                return(true);
            }
            catch { return(false); }
        }
예제 #7
0
        /// <summary>
        /// Sends the received corpus to the feed form. If no CSRF token is held yet, or the
        /// first attempt fails, a fresh cookie is fetched and the corpus is sent once more.
        /// The outcome of that second attempt is not checked.
        /// </summary>
        /// <param name="sender">The producer that delivered the data (not used here).</param>
        /// <param name="data">Must be a <c>DocumentCorpus</c>.</param>
        protected override void ConsumeData(IDataProducer sender, object data)
        {
            ArgumentTypeException typeError = data is DocumentCorpus ? null : new ArgumentTypeException("data");
            Utils.ThrowException(typeError);
            DocumentCorpus received = (DocumentCorpus)data;

            // the first send is only attempted when a token is already available
            bool delivered = mCsrftoken != null && SendDocumentCorpusInfo(received);
            if (!delivered)
            {
                GetDjangoCookie();
                SendDocumentCorpusInfo(received);
            }
        }
예제 #8
0
 /// <summary>
 /// Runs <c>OnFilterDocument</c> over every document of the incoming corpus. Documents the
 /// filter rejects are removed from the corpus and dispatched separately as a "dump" corpus
 /// that carries the same features as the input.
 /// </summary>
 /// <param name="sender">The producer that delivered the data (not used here).</param>
 /// <param name="data">The corpus to filter; it is modified in place.</param>
 /// <returns>
 /// The input corpus if any documents remain, <c>null</c> if all were rejected, or the
 /// unmodified <paramref name="data"/> object if processing failed and the error was logged.
 /// </returns>
 public /*protected*/ override object ProcessData(IDataProducer sender, object data)
 {
     try
     {
         DocumentCorpus corpus     = (DocumentCorpus)data;
         DocumentCorpus dumpCorpus = new DocumentCorpus();
         dumpCorpus.CopyFeaturesFrom(corpus);
         // rejected documents are collected first so the corpus is not modified while it is enumerated
         ArrayList <Document> dumpDocumentList = new ArrayList <Document>();
         foreach (Document document in corpus.Documents)
         {
             try
             {
                 if (OnFilterDocument != null && !OnFilterDocument(document, mLogger))
                 {
                     dumpDocumentList.Add(document);
                 }
             }
             catch (Exception exception)
             {
                 // a failing filter keeps the document in the corpus
                 mLogger.Error("ProcessDocument", exception);
             }
         }
         foreach (Document doc in dumpDocumentList)
         {
             corpus.Remove(doc);
             dumpCorpus.AddDocument(doc);
         }
         if (dumpCorpus.Documents.Count > 0)
         {
             WorkflowUtils.DispatchData(this, dumpCorpus, mCloneDumpOnFork, mDumpDispatchPolicy, mDumpDataConsumers, mLogger);
         }
         return(corpus.Documents.Count > 0 ? corpus : null);
     }
     catch (Exception exception)
     {
         mLogger.Error("ProcessData", exception);
         return(data);
     }
 }
        /// <summary>
        /// Writes every document of the received corpus to the database through <c>ToDb</c>:
        /// one document row, then per-sentence sentiment-object occurrences (with their terms),
        /// sentiment-word occurrences, and per-block positive/negative word counts.
        /// </summary>
        /// <param name="sender">The producer that delivered the data (not used here).</param>
        /// <param name="data">Cast directly to <c>DocumentCorpus</c>; no type check is performed first.</param>
        protected override void ConsumeData(IDataProducer sender, object data)
        {
            DocumentCorpus c = (DocumentCorpus)data;

            foreach (Document doc in c.Documents)
            {
                // running 1-based counters; both are per document (sentenceNum is NOT reset per block)
                short sentenceNum = 0, blockNum = 0;
                // NOTE(review): accumulated below but never read in this method
                int   tokensPerDocument = 0;
                //string documentId = doc.Features.GetFeatureValue("guid");
                //documentId = documentId.Replace("-", "");
                //doc.Features.SetFeatureValue("fullId", corpusId + "_" + documentId);             //add feature fullId for Achim

                // missing features default to empty strings
                string responseUrl = doc.Features.GetFeatureValue("responseUrl") ?? "";
                string urlKey      = doc.Features.GetFeatureValue("urlKey") ?? "";
                string title       = doc.Features.GetFeatureValue("title") ?? "";
                string domainName  = doc.Features.GetFeatureValue("domainName") ?? "";

                //********************* date = pubDate if pubDate is earlier than timeGet by less than 3 days,
                //********************* otherwise date = timeGet
                string   pubDate = doc.Features.GetFeatureValue("pubDate") ?? "";
                // the "time" feature is parsed without a fallback, so a missing or malformed value throws here
                DateTime timeGet = DateTime.Parse(doc.Features.GetFeatureValue("time"));
                string   date    = timeGet.ToString("yyyy-MM-dd");
                try
                {
                    DateTime mPubDate = DateTime.Parse(pubDate);
                    if (DateTime.Compare(mPubDate, timeGet) < 0 && timeGet.Subtract(mPubDate).CompareTo(TimeSpan.FromDays(3)) < 0)
                    {
                        date = mPubDate.ToString("yyyy-MM-dd");
                    }
                }
                catch { } // suppress errors (an unparsable pubDate leaves date = timeGet)

                //******************* Document to database
                double pumpDumpIndex = Convert.ToDouble(doc.Features.GetFeatureValue("pumpIndex"));
                bool   isFinancial   = doc.Features.GetFeatureValue("isFinancial") == "True";
                // compute new ID: MD5 over the corpus GUID bytes followed by the document GUID bytes
                Guid             cGuid  = new Guid(c.Features.GetFeatureValue("guid"));
                Guid             dGuid  = new Guid(doc.Features.GetFeatureValue("guid"));
                ArrayList <byte> buffer = new ArrayList <byte>();
                buffer.AddRange(cGuid.ToByteArray());
                buffer.AddRange(dGuid.ToByteArray());
                Guid documentId = new Guid(MD5.Create().ComputeHash(buffer.ToArray()));
                long docId      = ToDb.DocumentToDb(mConnection, title, date, pubDate, timeGet.ToString("yyyy-MM-dd HH:mm"), responseUrl, urlKey, domainName, isFinancial, pumpDumpIndex, documentId);

                //******************* occurrences

                // only revision "1" uses all content blocks; any other "rev" value uses the "Unseen" blocks
                string blockSelector = "TextBlock/Content";
                string rev           = doc.Features.GetFeatureValue("rev");
                if (rev != "1")
                {
                    blockSelector = "TextBlock/Content/Unseen";
                }

                //******************** occurrence to database
                // NOTE(review): document-level counters are incremented below but never read in this method
                int documentNeg = 0, documentPoz = 0;

                doc.CreateAnnotationIndex();
                foreach (TextBlock tb in doc.GetAnnotatedBlocks(blockSelector)) //"TextBlock/Content" if rev = "1", else "TextBlock/Content/Unseen"
                {
                    int tokensPerBlock = doc.GetAnnotatedBlocks("Token", tb.SpanStart, tb.SpanEnd).Length;
                    tokensPerDocument += tokensPerBlock;
                    int blockNeg = 0, blockPoz = 0;
                    blockNum++;

                    foreach (TextBlock s in doc.GetAnnotatedBlocks("Sentence", tb.SpanStart, tb.SpanEnd)) // *** sentence selector within TextBlock tb
                    {
                        // NOTE(review): sentence-level counters and tokensPerSentence are computed but never read
                        int sentenceNeg = 0;
                        int sentencePoz = 0;
                        sentenceNum++;
                        int tokensPerSentence = doc.GetAnnotatedBlocks("Token", s.SpanStart, s.SpanEnd).Length;
                        // sentiment object
                        foreach (TextBlock so in doc.GetAnnotatedBlocks("SentimentObject", s.SpanStart, s.SpanEnd)) // *** SentimentObject selector within sentence s
                        {
                            Annotation annot = so.Annotation;
                            //     string gazUri = annot.Features.GetFeatureValue("gazetteerUri");
                            string instUri = annot.Features.GetFeatureValue("instanceUri");
                            //     string instClassUri = annot.Features.GetFeatureValue("instanceClassUri");
                            string term = so.Text; // this is how you get the actual text...
                            //   Console.WriteLine("\n" + gazUri + " \t" + instUri + " \t" + instClassUri + " \t" + term);
                            // store the occurrence, then attach its surface text using the returned occurrence ID
                            long occId = ToDb.OccurrenceToDb(mConnection, date, annot.SpanStart, annot.SpanEnd, sentenceNum, blockNum, docId, instUri);
                            ToDb.TermToDb(mConnection, occId, term);
                        }

                        // sentiment word
                        foreach (TextBlock so in doc.GetAnnotatedBlocks("SentimentWord", s.SpanStart, s.SpanEnd)) // *** SentimentWord selector within sentence s
                        {
                            Annotation annot        = so.Annotation;
                            string     gazUri       = annot.Features.GetFeatureValue("gazetteerUri");
                            string     instUri      = annot.Features.GetFeatureValue("instanceUri");
                            string     instClassUri = annot.Features.GetFeatureValue("instanceClassUri");
                            string     term         = so.Text; // this is how you get the actual text...
                            //   Console.WriteLine("\n" + gazUri + " \t" + instUri + " \t" + instClassUri + " \t" + term);
                            // polarity is decided by the suffix of the instance class URI;
                            // NOTE(review): a missing "instanceClassUri" feature would make EndsWith fail on null - confirm it is always set
                            if (instClassUri.EndsWith("PositiveWord"))
                            {
                                sentencePoz++;
                                blockPoz++;
                                documentPoz++;
                            }
                            else if (instClassUri.EndsWith("NegativeWord"))
                            {
                                sentenceNeg++;
                                blockNeg++;
                                documentNeg++;
                            }
                            // Insert into SQL table SentimentWordOccurrence
                            ToDb.SentimentWordOccurrenceToDb(mConnection, date, annot.SpanStart, annot.SpanEnd, sentenceNum, blockNum, docId, instUri);
                        }
                    }
                    // Insert into SQL table BlockSentiment (only for blocks with at least one sentiment word)
                    if (blockNeg != 0 || blockPoz != 0)
                    {
                        ToDb.BlockSentimentToDb(mConnection, docId, blockNum, blockPoz, blockNeg, tokensPerBlock);
                    }
                }
            }
        }
예제 #10
0
        /// <summary>
        /// Scratch entry point: prints a greeting, builds a corpus of two small documents
        /// (the first carrying one annotation), serializes the corpus to XML in memory and
        /// prints the result to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            Console.WriteLine("hello worlds!");

            // two sample documents; only the first one is annotated
            Document first = new Document("name", "bla bla");
            Document second = new Document("name2", "bla bla 2");
            first.AddAnnotation(new Annotation(0, 100, "waka waka"));

            // in-memory XML sink
            StringWriter output = new StringWriter();
            XmlTextWriter xmlOut = new XmlTextWriter(output);

            DocumentCorpus sample = new DocumentCorpus();
            sample.AddDocument(first);
            sample.AddDocument(second);
            sample.WriteXml(xmlOut);

            Console.WriteLine(output);
        }
 /// <summary>
 /// Processes one RSS item: optionally records it in the database, and - if it is not yet in
 /// the history - downloads the linked page (when a link is present), builds a document
 /// from the content plus all item attributes, adds it to the corpus and marks it as seen.
 /// Any exception is logged as a warning and swallowed.
 /// </summary>
 /// <param name="itemAttr">Item attributes by name; this dictionary is extended in place with download metadata.</param>
 /// <param name="corpus">Corpus that receives the new document.</param>
 /// <param name="rssXmlUrl">URL of the RSS feed the item came from (stored as sourceUrl).</param>
 /// <param name="xml">Raw XML of the item; stored together with its 128-bit hash.</param>
 private void ProcessItem(Dictionary<string, string> itemAttr, DocumentCorpus corpus, string rssXmlUrl, string xml)
 {
     try
     {
         // NOTE(review): TryGetValue assigns null to its out argument when the key is absent,
         // so the "" initializers below do not survive a missing attribute.
         string name = "";
         itemAttr.TryGetValue("title", out name);
         string desc = "";
         itemAttr.TryGetValue("description", out desc);
         string pubDate = "";
         itemAttr.TryGetValue("pubDate", out pubDate);
         // item identity is derived from title, description and publication date
         Guid guid = MakeGuid(name, desc, pubDate);
         mLogger.Info("ProcessItem", "Found item \"{0}\".", Utils.ToOneLine(name, /*compact=*/true));
         // the DB bookkeeping runs for every item, before the history check
         if (mDbConnectionString != null)
         {
             string xmlHash = Utils.GetHashCode128(xml).ToString("N");
             string category = null;
             itemAttr.TryGetValue("category", out category);
             string entities = null;
             itemAttr.TryGetValue("emm:entity", out entities);
             using (SqlConnection connection = new SqlConnection(mDbConnectionString))
             {
                 connection.Open();
                 using (SqlCommand cmd = new SqlCommand("insert into Sources (siteId, docId, sourceUrl, category, entities, xmlHash) values (@siteId, @docId, @sourceUrl, @category, @entities, @xmlHash)", connection))
                 {
                     WorkflowUtils.AssignParamsToCommand(cmd,
                         "siteId", Utils.Truncate(mSiteId, 100),
                         "docId", guid.ToString("N"),
                         "sourceUrl", Utils.Truncate(rssXmlUrl, 400),
                         "category", category,
                         "entities", entities,
                         "xmlHash", xmlHash);
                     cmd.ExecuteNonQuery();
                 }
                 using (SqlCommand cmd = new SqlCommand("insert into RssXml (hash, xml) values (@hash, @xml)", connection))
                 {
                     WorkflowUtils.AssignParamsToCommand(cmd,
                         "hash", xmlHash,
                         "xml", xml);
                     cmd.ExecuteNonQuery();
                 }
             }
         }
         // the rest only runs when CheckHistory(guid) returns false
         if (!mHistory.CheckHistory(guid))
         {
             DateTime time = DateTime.Now;
             string content = "";
             if (itemAttr.ContainsKey("link") && itemAttr["link"].Trim() != "")
             {
                 // get referenced Web page
                 mLogger.Info("ProcessItem", "Getting HTML from {0} ...", Utils.ToOneLine(itemAttr["link"], /*compact=*/true));
                 string mimeType, charSet;
                 string responseUrl;
                 CookieContainer cookies = null;
                 byte[] bytes = WebUtils.GetWebResource(itemAttr["link"], /*refUrl=*/null, ref cookies, WebUtils.DefaultTimeout, out mimeType, out charSet, mSizeLimit, out responseUrl);
                 // a null result is treated as "too large"; the item is still recorded in the history
                 if (bytes == null)
                 {
                     mLogger.Info("ProcessItem", "Item rejected because of its size.");
                     mHistory.AddToHistory(guid, mSiteId);
                     return;
                 }
                 ContentType contentType = GetContentType(mimeType);
                 // mContentFilter is applied as a bit mask over the detected content type
                 if ((contentType & mContentFilter) == 0)
                 {
                     mLogger.Info("ProcessItem", "Item rejected because of its content type.");
                     mHistory.AddToHistory(guid, mSiteId);
                     return;
                 }
                 // NOTE(review): Dictionary.Add throws if the key already exists; such an exception
                 // would be caught by the outer handler and the item dropped - confirm feeds never
                 // supply these attribute names themselves.
                 itemAttr.Add("responseUrl", responseUrl);
                 itemAttr.Add("mimeType", mimeType);
                 itemAttr.Add("contentType", contentType.ToString());
                 // fall back to ISO-8859-1 when no charset was reported
                 if (charSet == null) { charSet = "ISO-8859-1"; }
                 itemAttr.Add("charSet", charSet);
                 itemAttr.Add("contentLength", bytes.Length.ToString());
                 if (contentType == ContentType.Binary)
                 {
                     // save as base64-encoded binary data
                     content = Convert.ToBase64String(bytes);
                 }
                 else
                 {
                     // save as text
                     content = GetEncoding(charSet).GetString(bytes);
                     if (mIncludeRawData)
                     {
                         itemAttr.Add("raw", Convert.ToBase64String(bytes));
                     }
                 }
                 // pause after each download
                 Thread.Sleep(mPolitenessSleep);
             }
             // no downloaded content: fall back to the description, then to the title
             if (content == "")
             {
                 if (itemAttr.ContainsKey("description"))
                 {
                     content = itemAttr["description"];
                 }
                 else if (itemAttr.ContainsKey("title"))
                 {
                     content = itemAttr["title"];
                 }
             }
             itemAttr.Add("guid", guid.ToString());
             itemAttr.Add("time", time.ToString(Utils.DATE_TIME_SIMPLE));
             Document document = new Document(name, content);
             // every item attribute (including the metadata added above) becomes a document feature
             foreach (KeyValuePair<string, string> attr in itemAttr)
             {
                 document.Features.SetFeatureValue(attr.Key, attr.Value);
             }
             corpus.AddDocument(document);
             mHistory.AddToHistory(guid, mSiteId);
         }
     }
     catch (Exception e)
     {
         mLogger.Warn("ProcessItem", e);
     }
 }
예제 #12
0
 /// <summary>
 /// Looks up a stored corpus file by corpus ID (and optionally by corpus time), finds the
 /// document with the given ID inside it and returns that document in the requested format.
 /// </summary>
 /// <param name="corpusId">Corpus GUID, 32 hex digits with or without dashes.</param>
 /// <param name="docId">Document GUID, 32 hex digits with or without dashes (case-insensitive).</param>
 /// <param name="format">"html", "txt" or "gate_xml"; any other value yields plain document XML.</param>
 /// <param name="rmvRaw">When true, the "raw" feature is removed from the document before output.</param>
 /// <param name="changesOnly">For "txt": output only "Unseen" content blocks unless the document's "rev" feature is "1".</param>
 /// <param name="corpusTime">Optional corpus timestamp; when given, the file path is computed from it instead of searched for.</param>
 /// <returns>The rendered document, or a message starting with "***" describing why it could not be produced.</returns>
 public string GetDoc(string corpusId, string docId, string format, bool rmvRaw, bool changesOnly, string corpusTime)
 {
     string dataPath = Utils.GetConfigValue("DataPath", ".");
     // Both IDs end up in a file path / search pattern or a GUID comparison, so they must be
     // exactly 32 hex digits once dashes are stripped (a length check alone would let path
     // separators and wildcards through).
     const string hex32 = @"\A[0-9a-fA-F]{32}\z";
     if (corpusId == null) { return "*** Invalid corpus ID."; }
     corpusId = corpusId.Replace("-", "");
     if (!Regex.IsMatch(corpusId, hex32)) { return "*** Invalid corpus ID."; }
     if (docId == null) { return "*** Invalid document ID."; }
     docId = docId.Replace("-", "");
     if (!Regex.IsMatch(docId, hex32)) { return "*** Invalid document ID."; }
     string[] fileNames = null;
     if (!string.IsNullOrEmpty(corpusTime))
     {
         try
         {
             // files are laid out as <dataPath>\<year>\<month>\<day>\HH_mm_ss_<corpusId>.xml
             DateTime dt = DateTime.Parse(corpusTime);
             string prefix = dt.ToString("HH_mm_ss_");
             string path = "\\" + dt.Year + "\\" + dt.Month + "\\" + dt.Day + "\\";
             string fileName = dataPath.TrimEnd('\\') + path + prefix + corpusId + ".xml";
             if (!Utils.VerifyFileNameOpen(fileName)) { return "*** Corpus not found."; }
             fileNames = new string[] { fileName };
         }
         catch { return "*** Unable to parse time."; }
     }
     // without a usable time, search the whole data directory tree for the corpus file
     if (fileNames == null) { fileNames = Directory.GetFiles(dataPath, "*" + corpusId + ".xml", SearchOption.AllDirectories); }
     if (fileNames.Length == 0) { return "*** Corpus not found."; }
     DocumentCorpus corpus = new DocumentCorpus();
     // the readers are disposed even if parsing fails
     using (StreamReader reader = new StreamReader(fileNames[0]))
     using (XmlTextReader xmlReader = new XmlTextReader(reader))
     {
         corpus.ReadXml(xmlReader);
     }
     Document document = null;
     foreach (Document doc in corpus.Documents)
     {
         // Guid.ToString("N") is lowercase, so compare case-insensitively to accept uppercase IDs too
         string candidateId = new Guid(doc.Features.GetFeatureValue("guid")).ToString("N");
         if (string.Equals(candidateId, docId, StringComparison.OrdinalIgnoreCase)) { document = doc; break; }
     }
     if (document == null) { return "*** Document not found."; }
     if (rmvRaw) { document.Features.RemoveFeature("raw"); }
     string response;
     if (format == "html")
     {
         StringWriter writer = new StringWriter();
         document.MakeHtmlPage(writer, /*inlineCss=*/true);
         // strip the back-button fragment from the generated page
         string html = new Regex(@"<!--back_button-->.*?<!--/back_button-->").Replace(writer.ToString(), "");
         response = html;
     }
     else if (format == "txt")
     {
         StringBuilder txt = new StringBuilder();
         string selector = "TextBlock/Content";
         if (changesOnly && document.Features.GetFeatureValue("rev") != "1") { selector = "TextBlock/Content/Unseen"; }
         foreach (TextBlock block in document.GetAnnotatedBlocks(selector))
         {
             txt.AppendLine(block.Text);
         }
         response = document.Name + "\r\n\r\n" + txt.ToString();
     }
     else
     {
         StringWriter writer = new StringWriter();
         XmlWriterSettings xmlSettings = new XmlWriterSettings();
         xmlSettings.Indent = true;
         xmlSettings.NewLineOnAttributes = true;
         xmlSettings.CheckCharacters = false;
         using (XmlWriter xmlWriter = XmlWriter.Create(writer, xmlSettings))
         {
             if (format == "gate_xml")
             {
                 document.WriteGateXml(xmlWriter, /*writeTopElement=*/true, /*removeBoilerplate=*/true);
                 xmlWriter.Flush();
                 response = writer.ToString();
             }
             else // xml
             {
                 document.WriteXml(xmlWriter, /*writeTopElement=*/true);
                 xmlWriter.Flush();
                 // the in-memory writer declares utf-16; the response is relabelled as utf-8
                 response = writer.ToString().Replace("<?xml version=\"1.0\" encoding=\"utf-16\"?>",
                     "<?xml version=\"1.0\" encoding=\"utf-8\"?>");
             }
         }
     }
     return response;
 }
예제 #13
0
 /// <summary>
 /// Produces the next corpus from the configured data directories. Each call reads at most
 /// one "*.xml" file (files are taken in sorted order, directory by directory), strips
 /// annotations and processing-specific features, and restores the raw content if present.
 /// </summary>
 /// <returns>
 /// The loaded corpus, or <c>null</c> when all directories are exhausted (after calling
 /// <c>Stop</c>) or when the call only advanced to the next directory.
 /// </returns>
 /// <remarks>
 /// If reading or processing a file fails, the file is skipped (the index advances exactly
 /// once) and the original exception propagates to the caller.
 /// </remarks>
 protected override object ProduceData()
 {
     // are we done?
     if (mCurrentDirIdx >= mDataDirs.Length)
     {
         Stop();
         return(null);
     }
     // do we need to get more files?
     if (mFiles == null)
     {
         mFiles = Directory.GetFiles(mDataDirs[mCurrentDirIdx], "*.xml");
         Array.Sort(mFiles);
     }
     // did we process all currently available files?
     if (mCurrentFileIdx >= mFiles.Length)
     {
         mFiles          = null;
         mCurrentFileIdx = 0;
         mCurrentDirIdx++;
         return(null);
     }
     // tracks whether the file index was already advanced, so a late failure does not skip an extra file
     bool advanced = false;
     try
     {
         // read next file
         mLogger.Info("ProduceData", "Reading " + mFiles[mCurrentFileIdx] + " ...");
         DocumentCorpus corpus = new DocumentCorpus();
         // the reader is disposed even if parsing fails
         using (StreamReader reader = new StreamReader(mFiles[mCurrentFileIdx]))
         {
             corpus.ReadXml(new XmlTextReader(reader));
         }
         // refresh corpus ID (to avoid conflicts)
         corpus.Features.SetFeatureValue("guid", Guid.NewGuid().ToString());
         // remove underscores in feature names (names are copied first because the set is modified in the loop)
         string[] tmp = new string[corpus.Features.Names.Count];
         corpus.Features.Names.CopyTo(tmp, /*index=*/ 0);
         foreach (string featureName in tmp)
         {
             if (featureName.StartsWith("_"))
             {
                 corpus.Features.SetFeatureValue(featureName.TrimStart('_'), corpus.Features.GetFeatureValue(featureName));
                 corpus.Features.RemoveFeature(featureName);
             }
         }
         foreach (Document doc in corpus.Documents)
         {
             // remove annotations
             doc.ClearAnnotations();
             // remove underscores in feature names
             tmp = new string[doc.Features.Names.Count];
             doc.Features.Names.CopyTo(tmp, /*index=*/ 0);
             foreach (string featureName in tmp)
             {
                 if (featureName.StartsWith("_"))
                 {
                     doc.Features.SetFeatureValue(featureName.TrimStart('_'), doc.Features.GetFeatureValue(featureName));
                     doc.Features.RemoveFeature(featureName);
                 }
             }
             // remove processing-specific features
             foreach (string featureName in new string[] {
                 "detectedLanguage",
                 "detectedCharRange",
                 "bprBoilerplateCharCount",
                 "bprContentCharCount",
                 "domainName",
                 "urlKey",
                 "rev",
                 "blacklisted"
             })
             {
                 doc.Features.RemoveFeature(featureName);
             }
             // if there's raw data available, reset the content
             string raw = doc.Features.GetFeatureValue("raw");
             if (raw != null)
             {
                 doc.Features.SetFeatureValue("contentType", "Html");
                 doc.Text = GetEncoding(doc.Features.GetFeatureValue("charSet")).GetString(Convert.FromBase64String(raw));
             }
         }
         mCurrentFileIdx++;
         advanced = true;
         // throttle: wait while the downstream branch load is above 10
         while (WorkflowUtils.GetBranchLoadMax(this) > 10) // I'm giving it all she's got, Captain!
         {
             Thread.Sleep(1000);
         }
         return(corpus);
     }
     catch (Exception)
     {
         // skip the failing file, but only once
         if (!advanced)
         {
             mCurrentFileIdx++;
         }
         // rethrow without resetting the stack trace
         throw;
     }
 }
 /// <summary>
 /// Polls every RSS source in mSources once. For each source the feed XML is downloaded, the
 /// channel attributes are read, and each non-empty "item" element is collected into an
 /// attribute dictionary and handed to ProcessItem. Corpora are pushed downstream through
 /// DispatchData: as soon as a corpus holds mMaxDocsPerCorpus documents (when that limit is
 /// positive), and once more at the end of the feed if documents remain.
 /// </summary>
 /// <returns>Always null; the produced corpora are delivered via DispatchData.</returns>
 protected override object ProduceData()
 {
     for (int i = 0; i < mSources.Count; i++)
     {
         string url = mSources[i];
         int numNewItems = 0;
         try
         {
             DateTime timeStart = DateTime.Now;
             // get RSS XML
             string xml;
             try
             {
                 mLogger.Info("ProduceData", "Getting RSS XML from {0} ...", url);
                 xml = WebUtils.GetWebPageDetectEncoding(url);
                 xml = FixXml(xml);
             }
             catch (Exception e)
             {
                 // NOTE(review): a failed download ends the whole pass, so the remaining sources
                 // are not visited in this call -- confirm that this is intended
                 mLogger.Error("ProduceData", e);
                 return null;
             }
             Dictionary<string, string> channelAttr = new Dictionary<string, string>();
             DocumentCorpus corpus = new DocumentCorpus();
             corpus.Features.SetFeatureValue("guid", Guid.NewGuid().ToString());
             // first pass: channel attributes
             ReadChannelAttributes(url, xml, timeStart, channelAttr);
             // second pass: items
             mLogger.Info("ProduceData", "Reading items ...");
             // the using block releases the reader on every exit path (early return, break, exception)
             using (XmlTextReader reader = new XmlTextReader(new StringReader(xml)))
             {
                 while (reader.Read())
                 {
                     if (reader.NodeType == XmlNodeType.Element && reader.Name == "item" && !reader.IsEmptyElement)
                     {
                         Dictionary<string, string> itemAttr = new Dictionary<string, string>();
                         while (reader.Read() && !(reader.NodeType == XmlNodeType.EndElement && reader.Name == "item"))
                         {
                             if (reader.NodeType == XmlNodeType.Element)
                             {
                                 // handle item attributes
                                 if (mItemElements.Contains(reader.Name))
                                 {
                                     string attrName = reader.Name;
                                     // XML attributes must be fetched before the element value is consumed
                                     string emmTrigger = attrName == "category" ? reader.GetAttribute("emm:trigger") : null;
                                     string emmEntityId = attrName == "emm:entity" ? reader.GetAttribute("id") : null;
                                     string emmEntityName = attrName == "emm:entity" ? reader.GetAttribute("name") : null;
                                     string value = Utils.XmlReadValue(reader, attrName);
                                     if (value.Trim() != "")
                                     {
                                         string oldValue;
                                         if (attrName == "pubDate")
                                         {
                                             // keep the original string when it cannot be normalized
                                             string tmp = Utils.NormalizeDateTimeStr(value);
                                             if (tmp != null) { value = tmp; }
                                         }
                                         if (emmTrigger != null)
                                         {
                                             value += " ; " + emmTrigger.Replace(';', ',').TrimEnd(' ', ',');
                                         }
                                         if (emmEntityId != null && emmEntityName != null)
                                         {
                                             value = string.Format("{0} ; {1} ; {2}", emmEntityName, value, emmEntityId);
                                         }
                                         // repeated elements are concatenated with " ;; "
                                         if (itemAttr.TryGetValue(attrName, out oldValue))
                                         {
                                             itemAttr[attrName] = oldValue + " ;; " + value;
                                         }
                                         else
                                         {
                                             itemAttr.Add(attrName, value);
                                         }
                                     }
                                 }
                                 else
                                 {
                                     Utils.XmlSkip(reader, reader.Name);
                                 }
                             }
                         }
                         // stopped?
                         if (mStopped)
                         {
                             if (corpus.Documents.Count == 0) { return null; }
                             break; // dispatch what has been collected so far
                         }
                         ProcessItem(itemAttr, corpus, url, xml);
                         if (mMaxDocsPerCorpus > 0 && corpus.Documents.Count == mMaxDocsPerCorpus)
                         {
                             // corpus is full: dispatch it and start a new one
                             numNewItems += corpus.Documents.Count;
                             foreach (KeyValuePair<string, string> attr in channelAttr)
                             {
                                 corpus.Features.SetFeatureValue(attr.Key, attr.Value);
                             }
                             corpus.Features.SetFeatureValue("timeEnd", DateTime.Now.ToString(Utils.DATE_TIME_SIMPLE));
                             DispatchData(corpus);
                             corpus = new DocumentCorpus();
                             corpus.Features.SetFeatureValue("guid", Guid.NewGuid().ToString());
                         }
                     }
                 }
             }
             // dispatch the remaining documents, if any
             if (corpus.Documents.Count > 0)
             {
                 numNewItems += corpus.Documents.Count;
                 foreach (KeyValuePair<string, string> attr in channelAttr)
                 {
                     corpus.Features.SetFeatureValue(attr.Key, attr.Value);
                 }
                 corpus.Features.SetFeatureValue("timeEnd", DateTime.Now.ToString(Utils.DATE_TIME_SIMPLE));
                 DispatchData(corpus);
             }
             mLogger.Info("ProduceData", "{0} new items.", numNewItems);
             // stopped?
             if (mStopped) { return null; }
         }
         catch (Exception e)
         {
             // a failure in one source is logged and the next source is tried
             mLogger.Error("ProduceData", e);
         }
     }
     return null;
 }
 /// <summary>
 /// Starts the receiver thread unless the component is already running. The thread polls
 /// mMessenger until the component is stopped or the messenger reports that messaging has
 /// finished; every non-null message is parsed as document corpus XML and dispatched.
 /// A message that fails to parse or dispatch is logged and skipped.
 /// </summary>
 public override void Start()
 {
     if (!IsRunning)
     {
         mThread = new Thread(new ThreadStart(
             delegate() {
                 while (!mStopped && !mMessenger.isMessagingFinished())
                 {
                     string message = mMessenger.getMessage();
                     if (message != null)
                     {
                         try
                         {
                             DocumentCorpus dc = new DocumentCorpus();
                             // the using block closes the reader even when the XML is malformed
                             using (XmlReader reader = new XmlTextReader(new StringReader(message)))
                             {
                                 dc.ReadXml(reader);
                             }
                             DispatchData(dc);
                         }
                         catch (Exception e)
                         {
                             // keep the receiver alive; one bad message must not end the loop
                             mLogger.Error("ZeroMqReceiverComponent", e);
                         }
                     }
                     // short pause between polls
                     Thread.Sleep(1);
                 }
             }
         ));
         // reset the stop flag before the thread starts so that the loop is entered
         mStopped = false;
         mThread.Start();
     }
 }
예제 #16
0
        /// <summary>
        /// Persists every document of the incoming corpus to the configured outputs. Each output
        /// is optional and is skipped when its setting is null:
        /// compressed document XML (mXmlDataRoot), gzipped raw HTML (mHtmlDataRoot), a browsable
        /// HTML view (mHtmlViewRoot), and the SQL tables "Documents" and "TextBlocks"
        /// (mConnectionString). The document ID is the MD5 hash of the corpus GUID bytes followed
        /// by the document GUID bytes. Documents are cloned first, so the incoming corpus is not
        /// modified.
        /// </summary>
        /// <param name="sender">The producer that sent the data (not used here).</param>
        /// <param name="data">The corpus to write; must be a DocumentCorpus.</param>
        protected override void ConsumeData(IDataProducer sender, object data)
        {
            DocumentCorpus c            = (DocumentCorpus)data;
            DataTable      dt           = CreateTable();
            DataTable      dtTextBlocks = CreateTextBlocksTable();

            foreach (Document doc in c.Documents)
            {
                Document         d       = doc.Clone();
                string           rawHtml = d.Features.GetFeatureValue("raw");
                DateTime         time    = DateTime.Parse(d.Features.GetFeatureValue("time"));
                Guid             cGuid   = new Guid(c.Features.GetFeatureValue("guid"));
                Guid             dGuid   = new Guid(d.Features.GetFeatureValue("guid"));
                ArrayList <byte> buffer  = new ArrayList <byte>();
                buffer.AddRange(cGuid.ToByteArray());
                buffer.AddRange(dGuid.ToByteArray());
                Guid docId;
                // the hash algorithm object is disposable; release it right after use
                using (MD5 md5 = MD5.Create())
                {
                    docId = new Guid(md5.ComputeHash(buffer.ToArray()));
                }
                d.Features.RemoveFeature("raw");
                DateTime timeEnd = DateTime.Parse(c.Features.GetFeatureValue("timeEnd"));
                d.Features.SetFeatureValue("oldId", string.Format("{0:HH}_{0:mm}_{0:ss}_{1:N}_{2:N}", timeEnd, cGuid, dGuid));
                d.Features.SetFeatureValue("hash", dGuid.ToString("N"));
                d.Features.SetFeatureValue("guid", docId.ToString("N"));
                d.Features.SetFeatureValue("rssUrl", c.Features.GetFeatureValue("sourceUrl"));
                d.Features.SetFeatureValue("siteId", c.Features.GetFeatureValue("siteId"));
                // remove boilerplate removal features, keep hash codes
                ArrayList <ulong> hashCodes = new ArrayList <ulong>();
                foreach (Annotation annot in d.Annotations)
                {
                    if (annot.Type.StartsWith("TextBlock"))
                    {
                        ulong hashCode = Convert.ToUInt64(annot.Features.GetFeatureValue("hash"));
                        hashCodes.Add(hashCode);
                        // only these two features survive the cleanup
                        string linkToTextRatio = annot.Features.GetFeatureValue("linkToTextRatio");
                        string domPath         = annot.Features.GetFeatureValue("domPath");
                        annot.Features.Clear();
                        annot.Features.SetFeatureValue("linkToTextRatio", linkToTextRatio);
                        annot.Features.SetFeatureValue("domPath", domPath);
                    }
                }
                // write doc XML
                if (mXmlDataRoot != null)
                {
                    string outFileName = string.Format("{0}\\{1:yyyy}\\{1:MM}\\{1:dd}\\{1:HH}_{1:mm}_{1:ss}_{2:N}.xml.gz", mXmlDataRoot, time, docId);
                    string path        = new FileInfo(outFileName).DirectoryName;
                    Directory.CreateDirectory(path);
                    d.WriteXmlCompressed(outFileName);
                }
                // write raw HTML (skipped for documents without a "raw" feature; decoding a null
                // string would throw and abort the whole corpus, including the database write)
                if (mHtmlDataRoot != null && rawHtml != null)
                {
                    string outFileName = string.Format("{0}\\{1:yyyy}\\{1:MM}\\{1:dd}\\{1:HH}_{1:mm}_{1:ss}_{2:N}.html.gz", mHtmlDataRoot, time, docId);
                    string path        = new FileInfo(outFileName).DirectoryName;
                    Directory.CreateDirectory(path);
                    using (FileStream stream = new FileStream(outFileName, FileMode.Create))
                    {
                        using (GZipStream gzStream = new GZipStream(stream, CompressionMode.Compress))
                        {
                            using (BinaryWriter w = new BinaryWriter(gzStream))
                            {
                                w.Write(Convert.FromBase64String(rawHtml));
                            }
                        }
                    }
                }
                // write view HTML
                if (mHtmlViewRoot != null)
                {
                    string outFileName = string.Format("{0}\\{1:yyyy}\\{1:MM}\\{1:dd}\\{1:HH}_{1:mm}_{1:ss}_{2:N}.html", mHtmlViewRoot, time, docId);
                    string path        = new FileInfo(outFileName).DirectoryName.TrimEnd('\\');
                    Directory.CreateDirectory(path);
                    // the view references external style and script files; create them once per folder
                    if (!File.Exists(path + "\\Styles.css") || !File.Exists(path + "\\Code.js"))
                    {
                        string css = Utils.GetManifestResourceString(this.GetType(), "Styles.css");
                        string js  = Utils.GetManifestResourceString(this.GetType(), "Code.js");
                        // serialize the writes so that concurrent writers do not collide on the files
                        lock (mStaticLock)
                        {
                            File.WriteAllText(path + "\\Styles.css", css);
                            File.WriteAllText(path + "\\Code.js", js);
                        }
                    }
                    File.WriteAllText(outFileName, d.GetHtml(/*inlineCss=*/ false, /*inlineJs=*/ false), Encoding.UTF8);
                }
                // prepare for bulk write
                if (mConnectionString != null)
                {
                    string fileName = string.Format("{0:yyyy}\\{0:MM}\\{0:dd}\\{0:HH}_{0:mm}_{0:ss}_{1:N}.xml.gz", time, docId);
                    dt.Rows.Add(
                        docId, // same value as the "guid" feature set above
                        dGuid,
                        Utils.Truncate(d.Name, 400),
                        Utils.Truncate(d.Features.GetFeatureValue("description"), 400),
                        Utils.Truncate(d.Text, 1000),
                        Utils.Truncate(d.Features.GetFeatureValue("category"), 400),
                        Utils.Truncate(d.Features.GetFeatureValue("link"), 400),
                        Utils.Truncate(d.Features.GetFeatureValue("responseUrl"), 400),
                        Utils.Truncate(d.Features.GetFeatureValue("urlKey"), 400),
                        time, // already parsed from the "time" feature above
                        Utils.Truncate(d.Features.GetFeatureValue("pubDate"), 100),
                        Utils.Truncate(d.Features.GetFeatureValue("mimeType"), 80),
                        Utils.Truncate(d.Features.GetFeatureValue("charSet"), 40),
                        Convert.ToInt32(d.Features.GetFeatureValue("contentLength")),
                        Utils.Truncate(d.Features.GetFeatureValue("domainName"), 100),
                        Convert.ToInt32(d.Features.GetFeatureValue("bprBoilerplateCharCount")),
                        Convert.ToInt32(d.Features.GetFeatureValue("bprContentCharCount")),
                        Convert.ToInt32(d.Features.GetFeatureValue("unseenContentCharCount")),
                        Convert.ToInt32(d.Features.GetFeatureValue("rev")),
                        Utils.Truncate(fileName, 100),
                        Utils.Truncate(c.Features.GetFeatureValue("siteId"), 100)
                        );
                    // serialize the text block hash codes; copy only the bytes actually written
                    BinarySerializer memSer = new BinarySerializer();
                    hashCodes.Save(memSer);
                    byte[] hashCodesBinary = new byte[memSer.Stream.Position];
                    Array.Copy(((MemoryStream)memSer.Stream).GetBuffer(), hashCodesBinary, hashCodesBinary.Length);
                    dtTextBlocks.Rows.Add(
                        docId,
                        hashCodesBinary
                        );
                }
            }
            // bulk write to database
            if (mConnectionString != null && dt.Rows.Count > 0)
            {
                using (SqlConnection connection = new SqlConnection(mConnectionString))
                {
                    connection.Open();
                    using (SqlBulkCopy bulkWriter = new SqlBulkCopy(connection))
                    {
                        bulkWriter.BulkCopyTimeout      = mCommandTimeout;
                        bulkWriter.DestinationTableName = "Documents";
                        bulkWriter.WriteToServerRetryOnDeadlock(dt);
                        bulkWriter.DestinationTableName = "TextBlocks";
                        bulkWriter.WriteToServerRetryOnDeadlock(dtTextBlocks);
                    }
                }
            }
        }
예제 #17
0
 /// <summary>
 /// Turns one RSS item into a document and adds it to the corpus, unless the history check
 /// reports the item's GUID (built from title, description and publication date). When the
 /// item has a non-blank link, the linked resource is downloaded; items that exceed the size
 /// limit or whose content type is filtered out are recorded in the history and dropped.
 /// Binary content is stored base64-encoded, other content is decoded with the reported
 /// (or default) character set. Without downloaded content, the description or else the
 /// title is used as the document text. Any exception is logged as a warning and swallowed.
 /// </summary>
 /// <param name="itemAttr">Item attributes; download metadata, "guid" and "time" are added to it.</param>
 /// <param name="corpus">The corpus that receives the new document.</param>
 /// <param name="rssXmlUrl">URL of the feed (not used here).</param>
 /// <param name="xml">The feed XML (not used here).</param>
 private void ProcessItem(Dictionary <string, string> itemAttr, DocumentCorpus corpus, string rssXmlUrl, string xml)
 {
     try
     {
         // a missing key leaves the corresponding variable null
         string title;
         itemAttr.TryGetValue("title", out title);
         string description;
         itemAttr.TryGetValue("description", out description);
         string published;
         itemAttr.TryGetValue("pubDate", out published);
         Guid itemGuid = MakeGuid(title, description, published);
         mLogger.Info("ProcessItem", "Found item \"{0}\".", Utils.ToOneLine(title, /*compact=*/ true));
         if (mHistory.CheckHistory(itemGuid))
         {
             // the history check succeeded, nothing to do
             return;
         }
         DateTime acquiredAt = DateTime.Now;
         string   body       = "";
         string   link;
         if (itemAttr.TryGetValue("link", out link) && link.Trim() != "")
         {
             // get referenced Web page
             mLogger.Info("ProcessItem", "Getting HTML from {0} ...", Utils.ToOneLine(link, /*compact=*/ true));
             string          mimeType, charSet, responseUrl;
             CookieContainer cookies = null;
             byte[]          payload = WebUtils.GetWebResource(link, /*refUrl=*/ null, ref cookies, WebUtils.DefaultTimeout, out mimeType, out charSet, mSizeLimit, out responseUrl);
             if (payload == null)
             {
                 mLogger.Info("ProcessItem", "Item rejected because of its size.");
                 mHistory.AddToHistory(itemGuid, mSiteId);
                 return;
             }
             ContentType contentType = GetContentType(mimeType);
             if ((contentType & mContentFilter) == 0)
             {
                 mLogger.Info("ProcessItem", "Item rejected because of its content type.");
                 mHistory.AddToHistory(itemGuid, mSiteId);
                 return;
             }
             itemAttr.Add("responseUrl", responseUrl);
             itemAttr.Add("mimeType", mimeType);
             itemAttr.Add("contentType", contentType.ToString());
             // fall back to the configured encoding when the response names none
             charSet = charSet ?? Config.rssReaderDefaultHtmlEncoding;
             itemAttr.Add("charSet", charSet);
             itemAttr.Add("contentLength", payload.Length.ToString());
             if (contentType != ContentType.Binary)
             {
                 // save as text
                 body = GetEncoding(charSet).GetString(payload);
                 if (mIncludeRawData)
                 {
                     itemAttr.Add("raw", Convert.ToBase64String(payload));
                 }
             }
             else
             {
                 // save as base64-encoded binary data
                 body = Convert.ToBase64String(payload);
             }
             Thread.Sleep(mPolitenessSleep);
         }
         if (body == "")
         {
             // no downloaded content: use the description, or else the title
             string fallback;
             if (itemAttr.TryGetValue("description", out fallback) || itemAttr.TryGetValue("title", out fallback))
             {
                 body = fallback;
             }
         }
         itemAttr.Add("guid", itemGuid.ToString());
         itemAttr.Add("time", acquiredAt.ToString(Utils.DATE_TIME_SIMPLE));
         Document document = new Document(title, body);
         foreach (KeyValuePair <string, string> entry in itemAttr)
         {
             document.Features.SetFeatureValue(entry.Key, entry.Value);
         }
         corpus.AddDocument(document);
         mHistory.AddToHistory(itemGuid, mSiteId);
     }
     catch (Exception e)
     {
         mLogger.Warn("ProcessItem", e);
     }
 }
예제 #18
0
 /// <summary>
 /// Polls every RSS source in mSources once. The feed is downloaded as bytes and decoded with
 /// the character set reported by the server; when none is reported and
 /// mRssXmlCodePageDetectorLanguage is set, the code page is guessed with mCodePageDetector,
 /// and Config.rssReaderDefaultRssXmlEncoding is the final fallback. After the channel
 /// attributes are read, each non-empty "item" element is collected into an attribute
 /// dictionary and handed to ProcessItem. Corpora are pushed downstream through DispatchData:
 /// as soon as a corpus holds mMaxDocsPerCorpus documents (when that limit is positive), and
 /// once more at the end of the feed if documents remain.
 /// </summary>
 /// <returns>Always null; the produced corpora are delivered via DispatchData.</returns>
 protected override object ProduceData()
 {
     for (int i = 0; i < mSources.Count; i++)
     {
         string url         = mSources[i];
         int    numNewItems = 0;
         try
         {
             DateTime timeStart = DateTime.Now;
             Dictionary <string, string> channelAttr = new Dictionary <string, string>();
             // get RSS XML
             string xml;
             try
             {
                 mLogger.Info("ProduceData", "Getting RSS XML from {0} ...", url);
                 string   mimeType, charSet;
                 Encoding codePage = null;
                 byte[]   xmlBytes = WebUtils.GetWebResource(url, out mimeType, out charSet);
                 if (charSet == null) // charSet info not given
                 {
                     if (mRssXmlCodePageDetectorLanguage != null)
                     {
                         // decode as ISO-8859-1 just to get text for the detector
                         xml = FixXml(Encoding.GetEncoding("ISO-8859-1").GetString(xmlBytes));
                         // extract texts
                         string content = ExtractRssXmlContent(xml);
                         // try to guess code page
                         ArrayList <KeyDat <double, LanguageProfile> > ldResult = mCodePageDetector.DetectLanguageAll(content);
                         try
                         {
                             // among the profiles of the configured language, take the one with the smallest key
                             LanguageProfile bestLanguageProfile = ldResult
                                                                   .Where(x => x.Second.Language == mRssXmlCodePageDetectorLanguage)
                                                                   .OrderBy(x => x.First)
                                                                   .First()
                                                                   .Second;
                             codePage = bestLanguageProfile.CodePage;
                         }
                         catch
                         {
                             // deliberate best effort: when no profile matches (First() throws) or the
                             // lookup fails, codePage stays null and the default encoding is used below
                         }
                     }
                 }
                 else
                 {
                     codePage = Encoding.GetEncoding(charSet);
                 }
                 if (codePage == null)
                 {
                     codePage = Encoding.GetEncoding(Config.rssReaderDefaultRssXmlEncoding);
                 }
                 xml = FixXml(codePage.GetString(xmlBytes));
             }
             catch (Exception e)
             {
                 // NOTE(review): a failed download or decode ends the whole pass, so the remaining
                 // sources are not visited in this call -- confirm that this is intended
                 mLogger.Error("ProduceData", e);
                 return null;
             }
             DocumentCorpus corpus = new DocumentCorpus();
             corpus.Features.SetFeatureValue("guid", Guid.NewGuid().ToString());
             // first pass: channel attributes
             ReadChannelAttributes(url, xml, timeStart, channelAttr);
             // second pass: items
             mLogger.Info("ProduceData", "Reading items ...");
             // the using block releases the reader on every exit path (early return, break, exception)
             using (XmlTextReader reader = new XmlTextReader(new StringReader(xml)))
             {
                 while (reader.Read())
                 {
                     if (reader.NodeType == XmlNodeType.Element && reader.Name == "item" && !reader.IsEmptyElement)
                     {
                         Dictionary <string, string> itemAttr = new Dictionary <string, string>();
                         while (reader.Read() && !(reader.NodeType == XmlNodeType.EndElement && reader.Name == "item"))
                         {
                             if (reader.NodeType == XmlNodeType.Element)
                             {
                                 // handle item attributes
                                 if (mItemElements.Contains(reader.Name))
                                 {
                                     string attrName      = reader.Name;
                                     // XML attributes must be fetched before the element value is consumed
                                     string emmTrigger    = attrName == "category" ? reader.GetAttribute("emm:trigger") : null;
                                     string emmEntityId   = attrName == "emm:entity" ? reader.GetAttribute("id") : null;
                                     string emmEntityName = attrName == "emm:entity" ? reader.GetAttribute("name") : null;
                                     string value         = Utils.XmlReadValue(reader, attrName);
                                     if (value.Trim() != "")
                                     {
                                         string oldValue;
                                         if (attrName == "pubDate")
                                         {
                                             // keep the original string when it cannot be normalized
                                             string tmp = Utils.NormalizeDateTimeStr(value);
                                             if (tmp != null)
                                             {
                                                 value = tmp;
                                             }
                                         }
                                         if (emmTrigger != null)
                                         {
                                             value += " ; " + emmTrigger.Replace(';', ',').TrimEnd(' ', ',');
                                         }
                                         if (emmEntityId != null && emmEntityName != null)
                                         {
                                             value = string.Format("{0} ; {1} ; {2}", emmEntityName, value, emmEntityId);
                                         }
                                         // repeated elements are concatenated with " ;; "
                                         if (itemAttr.TryGetValue(attrName, out oldValue))
                                         {
                                             itemAttr[attrName] = oldValue + " ;; " + value;
                                         }
                                         else
                                         {
                                             itemAttr.Add(attrName, value);
                                         }
                                     }
                                 }
                                 else
                                 {
                                     Utils.XmlSkip(reader, reader.Name);
                                 }
                             }
                         }
                         // stopped?
                         if (mStopped)
                         {
                             if (corpus.Documents.Count == 0)
                             {
                                 return null;
                             }
                             break; // dispatch what has been collected so far
                         }
                         ProcessItem(itemAttr, corpus, url, xml);
                         if (mMaxDocsPerCorpus > 0 && corpus.Documents.Count == mMaxDocsPerCorpus)
                         {
                             // corpus is full: dispatch it and start a new one
                             numNewItems += corpus.Documents.Count;
                             foreach (KeyValuePair <string, string> attr in channelAttr)
                             {
                                 corpus.Features.SetFeatureValue(attr.Key, attr.Value);
                             }
                             corpus.Features.SetFeatureValue("timeEnd", DateTime.Now.ToString(Utils.DATE_TIME_SIMPLE));
                             DispatchData(corpus);
                             corpus = new DocumentCorpus();
                             corpus.Features.SetFeatureValue("guid", Guid.NewGuid().ToString());
                         }
                     }
                 }
             }
             // dispatch the remaining documents, if any
             if (corpus.Documents.Count > 0)
             {
                 numNewItems += corpus.Documents.Count;
                 foreach (KeyValuePair <string, string> attr in channelAttr)
                 {
                     corpus.Features.SetFeatureValue(attr.Key, attr.Value);
                 }
                 corpus.Features.SetFeatureValue("timeEnd", DateTime.Now.ToString(Utils.DATE_TIME_SIMPLE));
                 DispatchData(corpus);
             }
             mLogger.Info("ProduceData", "{0} new items.", numNewItems);
             // stopped?
             if (mStopped)
             {
                 return null;
             }
         }
         catch (Exception e)
         {
             // a failure in one source is logged and the next source is tried
             mLogger.Error("ProduceData", e);
         }
     }
     return null;
 }
예제 #19
0
        static void Main(string[] args)
        {
            Logger logger = Logger.GetRootLogger();
            ZeroMqReceiverComponent zmqRcv = new ZeroMqReceiverComponent(delegate(string key) {
                if (key == "MessageSendAddress" || key == "ReceiveLoadBalancingAdress" || key == "FinishPublish")
                {
                    return(null);
                }                                                                                                                  // ignore these settings
                return(ConfigurationManager.AppSettings.Get(key));
            });
            ZeroMqEmitterComponent zmqEmt = new ZeroMqEmitterComponent(delegate(string key) {
                if (key == "MessageReceiveAddress" || key == "SendLoadBalancingAddress" || key == "FinishReceive")
                {
                    return(null);
                }                                                                                                                   // ignore these settings
                return(ConfigurationManager.AppSettings.Get(key));
            });
            PassOnComponent oldBranch = new PassOnComponent(); // first branch (goes to WP4)

            oldBranch.DispatchPolicy = DispatchPolicy.BalanceLoadMax;
            PassOnComponent bypass = new PassOnComponent(); // second branch ("bypass", writes to DB)

            bypass.DispatchPolicy = DispatchPolicy.BalanceLoadMax;
            zmqRcv.Subscribe(oldBranch);
            zmqRcv.Subscribe(bypass);
            for (int i = 0; i < NUM_PIPES; i++)
            {
                DocumentFilterComponent rcv = new DocumentFilterComponent();
                rcv.OnFilterDocument += new DocumentFilterComponent.FilterDocumentHandler(delegate(Document doc, Logger log) {
                    Console.WriteLine("RCV " + doc.Name);
                    return(true);
                });
                DocumentCategorizerComponent cc = new DocumentCategorizerComponent();
                cc.BlockSelector = "TextBlock/Content";
                DocumentFilterComponent dfc = new DocumentFilterComponent();
                dfc.OnFilterDocument += new DocumentFilterComponent.FilterDocumentHandler(Filter);
                EntityRecognitionComponent erc = new EntityRecognitionComponent(ONTOLOGY_FOLDER);
                erc.BlockSelector = "TextBlock/Content";
                DocumentFilterComponent snd = new DocumentFilterComponent();
                snd.OnFilterDocument += new DocumentFilterComponent.FilterDocumentHandler(delegate(Document doc, Logger log) {
                    Console.WriteLine("SND " + doc.Name + " [" + doc.Features.GetFeatureValue("fullId") + "]");
                    return(true);
                });
                GenericStreamDataProcessor mkId = new GenericStreamDataProcessor();
                mkId.OnProcessData += new GenericStreamDataProcessor.ProcessDataHandler(delegate(IDataProducer sender, object data) {
                    DocumentCorpus c = (DocumentCorpus)data;
                    string corpusId  = c.Features.GetFeatureValue("guid").Replace("-", "");
                    DateTime timeEnd = DateTime.Parse(c.Features.GetFeatureValue("timeEnd"));
                    foreach (Document d in c.Documents)
                    {
                        string docId  = d.Features.GetFeatureValue("guid").Replace("-", "");
                        string fullId = timeEnd.ToString("HH_mm_ss_") + corpusId + "_" + docId;
                        d.Features.SetFeatureValue("fullId", fullId);
                    }
                    return(data);
                });
                oldBranch.Subscribe(rcv);
                rcv.Subscribe(cc);
                cc.Subscribe(dfc);
                dfc.Subscribe(erc);
                erc.Subscribe(mkId);
                mkId.Subscribe(snd);
                snd.Subscribe(zmqEmt);
            }
            // Petra's code
            Ontology o = new Ontology(ONTOLOGY_FOLDER_BYPASS);

            o.ToDb(CONNECTION_STRING_OCCURRENCE);    // fill DB tables entity and class
            // end of Petra's code
            for (int i = 0; i < NUM_PIPES_BYPASS; i++)
            {
                // create components
                EntityRecognitionComponent erc = new EntityRecognitionComponent(ONTOLOGY_FOLDER_BYPASS);
                erc.BlockSelector = "TextBlock/Content";
                OntologyCategorizerComponent occ = new OntologyCategorizerComponent();
                PumpIndexComponent           pic = new PumpIndexComponent();
                OccurrenceWriterComponent    owc = new OccurrenceWriterComponent(CONNECTION_STRING_OCCURRENCE);
                DocumentWriterComponent      dwc = new DocumentWriterComponent(null, /*cmdTimeout=*/ 0, XML_DATA_ROOT, null);
                // build branch
                bypass.Subscribe(erc);
                erc.Subscribe(occ);
                occ.Subscribe(pic);
                pic.Subscribe(owc);
                pic.Subscribe(dwc);
            }
            zmqRcv.Start();
            logger.Info("Main", "The pipeline is running.");
        }
 /// <summary>
 /// Posts one form entry per document of <paramref name="corpus"/> (URL, title, source, text
 /// snippet, timestamp) to the feed form at http://first-vm4.ijs.si/feed-form/.
 /// </summary>
 /// <param name="corpus">Corpus whose documents are reported; its "siteId" feature is sent as the source.</param>
 /// <returns>
 /// <c>true</c> if the request body was sent and a response was received; <c>false</c> if sending
 /// the request or reading the response failed.
 /// </returns>
 private bool SendDocumentCorpusInfo(DocumentCorpus corpus)
 {
     // taken from Latino.Web WebUtils.cs
     HttpWebRequest request = (HttpWebRequest)WebRequest.Create("http://first-vm4.ijs.si/feed-form/");
     request.UserAgent = "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.8.0.6) Gecko/20060728 Firefox/1.5.0.6";
     request.Accept = "text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,*/*;q=0.5";
     request.Headers.Add("Accept-Language", "en-us,en;q=0.5");
     request.Headers.Add("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.7");
     // configure POST request
     request.CookieContainer = mCookies;
     request.Method = "POST";
     // form header: CSRF token plus the number of per-document form entries that follow
     StringBuilder postData = new StringBuilder(string.Format("csrfmiddlewaretoken={0}&form-TOTAL_FORMS={1}&form-INITIAL_FORMS=0", mCsrftoken, corpus.Documents.Count));
     int formIdx = 0;
     foreach (Document document in corpus.Documents)
     {
         string title = Utils.ToOneLine(document.Name, /*compact=*/true);
         // build the snippet from content blocks that IsSubstring scores below 0.2 against the title
         // (presumably this skips blocks that merely repeat the title -- confirm against IsSubstring)
         StringBuilder text = new StringBuilder();
         foreach (TextBlock textBlock in document.GetAnnotatedBlocks("TextBlock/Content"))
         {
             if (IsSubstring(textBlock.Text, title) < 0.2)
             {
                 text.AppendLine(textBlock.Text);
                 // stop collecting early; the snippet is truncated to 500 characters below
                 if (text.Length > 600) { break; }
             }
         }
         string snippet = Utils.ToOneLine(Utils.Truncate(text.ToString(), 500), /*compact=*/true);
         //&form-0-url=...&form-0-title=...&form-0-source=...&form-0-snippet=...&form-0-timestamp=...
         // NOTE(review): the timestamp is a hard-coded, already URL-encoded constant and not taken
         // from the document -- looks like a placeholder; confirm which feature should be sent.
         string docData = string.Format("&form-{5}-url={0}&form-{5}-title={1}&form-{5}-source={2}&form-{5}-snippet={3}&form-{5}-timestamp={4}",
             HttpUtility.UrlEncode(document.Features.GetFeatureValue("responseUrl")),
             HttpUtility.UrlEncode(HttpUtility.HtmlEncode(title)),
             HttpUtility.UrlEncode(corpus.Features.GetFeatureValue("siteId")),
             HttpUtility.UrlEncode(HttpUtility.HtmlEncode(snippet) + " ..."),
             "2012-04-13+16%3A47%3A38",
             formIdx);
         postData.Append(docData);
         formIdx++;
     }
     //Console.WriteLine(postData.ToString());
     byte[] buffer = Encoding.ASCII.GetBytes(postData.ToString());
     request.ContentLength = buffer.Length;
     request.ContentType = "application/x-www-form-urlencoded";
     // send request
     try
     {
         // GetRequestStream is inside the try block because it can already fail with a network
         // error; such failures are reported as 'false' like any other failed send.
         using (Stream dataStream = request.GetRequestStream())
         {
             dataStream.Write(buffer, 0, buffer.Length);
         }
         request.GetResponse().Close();
         return true;
     }
     catch { return false; } // best effort: any failure is reported to the caller as 'false'
 }
예제 #21
0
        /// <summary>
        /// Persists a received corpus. Depending on configuration it is written as an XML file under
        /// mXmlDataRoot, rendered as an HTML page under mHtmlDataRoot, and/or inserted into the
        /// Corpora and Documents database tables.
        /// </summary>
        /// <param name="sender">Producer that delivered the data; not used by this method.</param>
        /// <param name="data">
        /// Must be a DocumentCorpus; its "guid" and "timeEnd" features are read to build the record ID
        /// and the output paths.
        /// </param>
        protected override void ConsumeData(IDataProducer sender, object data)
        {
            Utils.ThrowException(!(data is DocumentCorpus) ? new ArgumentTypeException("data") : null);
            DocumentCorpus corpus   = (DocumentCorpus)data;
            string         corpusId = corpus.Features.GetFeatureValue("guid").Replace("-", "");

            // serialize the corpus to an in-memory XML string
            XmlWriterSettings xmlSettings = new XmlWriterSettings();
            xmlSettings.Indent = true;
            xmlSettings.NewLineOnAttributes = true;
            xmlSettings.CheckCharacters     = false;
            StringWriter stringWriter = new StringWriter();
            // 'using' closes the writer even if WriteXml throws
            using (XmlWriter writer = XmlWriter.Create(stringWriter, xmlSettings))
            {
                corpus.WriteXml(writer, /*writeTopElement=*/ true);
            }

            // the record ID and the output folders are derived from the corpus' own "timeEnd"
            // feature, not from the current time
            DateTime timeEnd  = DateTime.Parse(corpus.Features.GetFeatureValue("timeEnd"));
            string   recordId = timeEnd.ToString("HH_mm_ss_") + corpusId;

            // write to file: <mXmlDataRoot>\<year>\<month>\<day>\<recordId>.xml
            if (mXmlDataRoot != null)
            {
                string path = string.Format(@"{3}\{0}\{1}\{2}\", timeEnd.Year, timeEnd.Month, timeEnd.Day, mXmlDataRoot.TrimEnd('\\'));
                if (!Directory.Exists(path))
                {
                    lock (mLock)
                    {
                        if (!Directory.Exists(path))
                        {
                            Directory.CreateDirectory(path);
                        }
                    }
                }
                // 'using' releases the file handle even if the write fails
                using (StreamWriter w = new StreamWriter(path + recordId + ".xml", /*append=*/ false, Encoding.UTF8))
                {
                    // the XML was produced through a StringWriter and therefore declares utf-16;
                    // the file itself is written as UTF-8, so the declaration is patched to match
                    w.Write(stringWriter.ToString().Replace("<?xml version=\"1.0\" encoding=\"utf-16\"?>", "<?xml version=\"1.0\" encoding=\"utf-8\"?>"));
                }
            }
            // write HTML page: <mHtmlDataRoot>\<year>\<month>\<day>\<recordId>\
            if (mHtmlDataRoot != null)
            {
                string pathHtml = string.Format(@"{4}\{0}\{1}\{2}\{3}\", timeEnd.Year, timeEnd.Month, timeEnd.Day, recordId, mHtmlDataRoot.TrimEnd('\\'));
                if (!Directory.Exists(pathHtml))
                {
                    lock (mLock)
                    {
                        if (!Directory.Exists(pathHtml))
                        {
                            Directory.CreateDirectory(pathHtml);
                        }
                    }
                }
                corpus.MakeHtmlPage(pathHtml, /*inlineCss=*/ true);
            }
            // write to database
            if (mWriteToDatabase)
            {
                // one row for the corpus itself; string values are truncated before insertion
                // (presumably to the widths of the target columns -- verify against the schema)
                bool success = mConnection.ExecuteNonQuery("insert into Corpora (id, title, language, sourceUrl, timeStart, timeEnd, siteId, rejected) values (?, ?, ?, ?, ?, ?, ?, ?)",
                                                           corpusId,
                                                           Utils.Truncate(corpus.Features.GetFeatureValue("title"), 400),
                                                           Utils.Truncate(corpus.Features.GetFeatureValue("language"), 100),
                                                           Utils.Truncate(corpus.Features.GetFeatureValue("sourceUrl"), 400),
                                                           Utils.Truncate(corpus.Features.GetFeatureValue("timeStart"), 26),
                                                           Utils.Truncate(corpus.Features.GetFeatureValue("timeEnd"), 26),
                                                           Utils.Truncate(corpus.Features.GetFeatureValue("siteId"), 100),
                                                           mIsDumpWriter
                                                           );
                if (!success)
                {
                    mLogger.Warn("ConsumeData", "Unable to write to database.");
                }
                // one row per document; a failed corpus insert does not stop the document inserts
                foreach (Document document in corpus.Documents)
                {
                    string documentId                = new Guid(document.Features.GetFeatureValue("guid")).ToString("N");
                    string bpCharCountStr            = document.Features.GetFeatureValue("bprBoilerplateCharCount");
                    string contentCharCountStr       = document.Features.GetFeatureValue("bprContentCharCount");
                    string unseenContentCharCountStr = document.Features.GetFeatureValue("unseenContentCharCount");
                    string unseenContent             = document.Features.GetFeatureValue("unseenContent");
                    success = mConnection.ExecuteNonQuery("insert into Documents (id, corpusId, name, description, category, link, responseUrl, urlKey, time, pubDate, mimeType, contentType, charSet, contentLength, detectedLanguage, detectedCharRange, domain, bpCharCount, contentCharCount, rejected, unseenContent, unseenContentCharCount, rev) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
                                                          documentId,
                                                          corpusId,
                                                          Utils.Truncate(document.Name, 400),
                                                          Utils.Truncate(document.Features.GetFeatureValue("description"), 400),
                                                          Utils.Truncate(document.Features.GetFeatureValue("category"), 400),
                                                          Utils.Truncate(document.Features.GetFeatureValue("link"), 400),
                                                          Utils.Truncate(document.Features.GetFeatureValue("responseUrl"), 400),
                                                          Utils.Truncate(document.Features.GetFeatureValue("urlKey"), 400),
                                                          Utils.Truncate(document.Features.GetFeatureValue("time"), 26),
                                                          Utils.Truncate(document.Features.GetFeatureValue("pubDate"), 26),
                                                          Utils.Truncate(document.Features.GetFeatureValue("mimeType"), 80),
                                                          Utils.Truncate(document.Features.GetFeatureValue("contentType"), 40),
                                                          Utils.Truncate(document.Features.GetFeatureValue("charSet"), 40),
                                                          Convert.ToInt32(document.Features.GetFeatureValue("contentLength")),
                                                          Utils.Truncate(document.Features.GetFeatureValue("detectedLanguage"), 100),
                                                          Utils.Truncate(document.Features.GetFeatureValue("detectedCharRange"), 100),
                                                          Utils.Truncate(document.Features.GetFeatureValue("domainName"), 100),
                                                          // the optional counters are stored as NULL when the feature is absent
                                                          bpCharCountStr == null ? null : (object)Convert.ToInt32(bpCharCountStr),
                                                          contentCharCountStr == null ? null : (object)Convert.ToInt32(contentCharCountStr),
                                                          mIsDumpWriter,
                                                          Utils.Truncate(unseenContent, 20),
                                                          unseenContentCharCountStr == null ? null : (object)Convert.ToInt32(unseenContentCharCountStr),
                                                          Convert.ToInt32(document.Features.GetFeatureValue("rev"))
                                                          );
                    if (!success)
                    {
                        mLogger.Warn("ConsumeData", "Unable to write to database.");
                    }
                }
            }
        }