/// <summary>
/// Launches the receiver thread unless the component is already running.
/// The thread polls the messenger, parses each non-null message as a document corpus
/// and dispatches it; parse/dispatch failures are logged and the loop continues.
/// </summary>
public override void Start()
 {
     // Nothing to do when the component reports that it is already running.
     if (IsRunning)
     {
         return;
     }
     ThreadStart receiveLoop = delegate()
     {
         // Keep polling until a stop is flagged or the messenger reports it is finished.
         while (!(mStopped || mMessenger.isMessagingFinished()))
         {
             string xml = mMessenger.getMessage();
             if (xml != null)
             {
                 try
                 {
                     DocumentCorpus corpus = new DocumentCorpus();
                     XmlReader xmlReader = new XmlTextReader(new StringReader(xml));
                     corpus.ReadXml(xmlReader);
                     xmlReader.Close();
                     DispatchData(corpus);
                 }
                 catch (Exception ex)
                 {
                     // A bad message must not terminate the receiver; log it and move on.
                     mLogger.Error("ZeroMqReceiverComponent", ex);
                 }
             }
             // Brief pause between polls.
             Thread.Sleep(1);
         }
     };
     mThread = new Thread(receiveLoop);
     // Clear the stop flag before the loop starts evaluating it.
     mStopped = false;
     mThread.Start();
 }
        /// <summary>
        /// Loads a FIRST XML corpus from a file and renders every document it contains as a
        /// separate GATE XML string.
        /// </summary>
        /// <param name="file">Path of the FIRST XML corpus file to read.</param>
        /// <returns>One GATE XML string per document, in the order of <c>corpus.Documents</c>.</returns>
        public static List<String> convertFIRSTXMLtoGATE(String file)
        {
            List<String> documents = new List<string>();

            // load FIRST XML corpus; the readers are disposed even when parsing fails
            DocumentCorpus corpus = new DocumentCorpus();
            using (StreamReader fileReader = new StreamReader(file))
            using (XmlTextReader xmlReader = new XmlTextReader(fileReader))
            {
                corpus.ReadXml(xmlReader);
            }

            // settings shared by all per-document writers
            // NOTE(review): when the target is a StringBuilder the XML declaration reports utf-16
            // regardless of the Encoding setting below -- confirm that consumers do not rely on it.
            XmlWriterSettings xmlSettings = new XmlWriterSettings();
            xmlSettings.Indent = true;
            xmlSettings.NewLineOnAttributes = true;
            xmlSettings.CheckCharacters = false;
            xmlSettings.Encoding = Encoding.UTF8;

            // save documents as GATE XML
            foreach (Document doc in corpus.Documents)
            {
                StringBuilder docXML = new StringBuilder();
                using (XmlWriter writer = XmlWriter.Create(docXML, xmlSettings))
                {
                    doc.WriteGateXml(writer, /*writeTopElement=*/true, /*removeBoilerplate=*/true);
                }
                // Read the buffer only after the writer has been closed: closing flushes pending
                // output and terminates any elements that are still open.
                documents.Add(docXML.ToString());
            }

            return documents;
        }
示例#3
0
        /// <summary>
        /// Converts one hard-coded FIRST XML corpus file into numbered GATE XML files
        /// (1.xml, 2.xml, ...) in the output directory, then waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // load FIRST XML corpus; the readers are disposed even when parsing fails
            DocumentCorpus corpus = new DocumentCorpus();
            using (StreamReader fileReader = new StreamReader(@"D:\streamer\files\23_55_08_450f24c0969d49d2883fc17a6f4e2af0.xml"))
            using (XmlTextReader xmlReader = new XmlTextReader(fileReader))
            {
                corpus.ReadXml(xmlReader);
            }

            // settings shared by all per-document writers
            XmlWriterSettings xmlSettings = new XmlWriterSettings();
            xmlSettings.Indent = true;
            xmlSettings.NewLineOnAttributes = true;
            xmlSettings.CheckCharacters = false;
            xmlSettings.Encoding = Encoding.UTF8;

            // save documents as GATE XML, one output file per document
            int i = 0;
            foreach (Document doc in corpus.Documents)
            {
                // The XML writer is disposed first (flushing its output), then the stream writer
                // closes the file; both happen even if WriteGateXml throws.
                using (StreamWriter streamWriter = new StreamWriter(string.Format(@"D:\streamer\output\{0}.xml", ++i)))
                using (XmlWriter writer = XmlWriter.Create(streamWriter, xmlSettings))
                {
                    doc.WriteGateXml(writer, /*writeTopElement=*/true, /*removeBoilerplate=*/true);
                }
            }

            // keep the console window open until a key is pressed
            Console.ReadKey();
        }
示例#4
0
 /// <summary>
 /// Locates a stored corpus file, finds one document in it by GUID and renders that document
 /// in the requested format.
 /// </summary>
 /// <param name="corpusId">Corpus GUID; must be 32 characters long once dashes are removed.</param>
 /// <param name="docId">Document GUID; must be 32 characters long once dashes are removed. Matched case-insensitively.</param>
 /// <param name="format">"html", "txt" or "gate_xml"; any other value produces the plain XML form.</param>
 /// <param name="rmvRaw">When true, the "raw" feature is removed from the document before rendering.</param>
 /// <param name="changesOnly">Only used for "txt": when true and the "rev" feature is not "1", only "TextBlock/Content/Unseen" blocks are emitted.</param>
 /// <param name="corpusTime">Optional timestamp from which the corpus file path is computed; when null or empty the data directory is searched recursively instead.</param>
 /// <returns>The rendered document, or a message starting with "*** " when validation or lookup fails.</returns>
 public string GetDoc(string corpusId, string docId, string format, bool rmvRaw, bool changesOnly, string corpusTime)
 {
     string dataPath = Utils.GetConfigValue("DataPath", ".");
     // both IDs are accepted with or without dashes
     if (corpusId == null || corpusId.Replace("-", "").Length != 32) { return "*** Invalid corpus ID."; }
     corpusId = corpusId.Replace("-", "");
     if (docId == null || docId.Replace("-", "").Length != 32) { return "*** Invalid document ID."; }
     docId = docId.Replace("-", "");
     string[] fileNames = null;
     if (!string.IsNullOrEmpty(corpusTime))
     {
         // a timestamp lets us compute <dataPath>\<year>\<month>\<day>\HH_mm_ss_<corpusId>.xml directly
         try
         {
             DateTime dt = DateTime.Parse(corpusTime);
             string prefix = dt.ToString("HH_mm_ss_");
             string path = "\\" + dt.Year + "\\" + dt.Month + "\\" + dt.Day + "\\";
             string fileName = dataPath.TrimEnd('\\') + path + prefix + corpusId + ".xml";
             if (!Utils.VerifyFileNameOpen(fileName)) { return "*** Corpus not found."; }
             fileNames = new string[] { fileName };
         }
         // NOTE(review): this also reports failures of the file check as a parse error -- kept for compatibility
         catch { return "*** Unable to parse time."; }
     }
     // without a timestamp, fall back to a recursive search for the corpus file
     if (fileNames == null) { fileNames = Directory.GetFiles(dataPath, "*" + corpusId + ".xml", SearchOption.AllDirectories); }
     if (fileNames.Length == 0) { return "*** Corpus not found."; }
     DocumentCorpus corpus = new DocumentCorpus();
     // the readers are disposed even when parsing fails, so the file handle is not leaked
     using (StreamReader reader = new StreamReader(fileNames[0]))
     using (XmlTextReader xmlReader = new XmlTextReader(reader))
     {
         corpus.ReadXml(xmlReader);
     }
     Document document = null;
     foreach (Document doc in corpus.Documents)
     {
         string guid = doc.Features.GetFeatureValue("guid");
         // a document without a "guid" feature cannot match; skip it rather than fail the whole lookup
         if (guid == null) { continue; }
         // Guid.ToString("N") yields lower-case hex, so compare case-insensitively with the caller's ID
         if (string.Equals(new Guid(guid).ToString("N"), docId, StringComparison.OrdinalIgnoreCase)) { document = doc; break; }
     }
     if (document == null) { return "*** Document not found."; }
     if (rmvRaw) { document.Features.RemoveFeature("raw"); }
     string response;
     if (format == "html")
     {
         StringWriter writer = new StringWriter();
         document.MakeHtmlPage(writer, /*inlineCss=*/true);
         // strip the section delimited by the back_button marker comments
         string html = new Regex(@"<!--back_button-->.*?<!--/back_button-->").Replace(writer.ToString(), "");
         response = html;
     }
     else if (format == "txt")
     {
         StringBuilder txt = new StringBuilder();
         string selector = "TextBlock/Content";
         if (changesOnly && document.Features.GetFeatureValue("rev") != "1") { selector = "TextBlock/Content/Unseen"; }
         foreach (TextBlock block in document.GetAnnotatedBlocks(selector))
         {
             txt.AppendLine(block.Text);
         }
         // document name, a blank line, then one line per selected block
         response = document.Name + "\r\n\r\n" + txt.ToString();
     }
     else
     {
         StringWriter writer = new StringWriter();
         XmlWriterSettings xmlSettings = new XmlWriterSettings();
         xmlSettings.Indent = true;
         xmlSettings.NewLineOnAttributes = true;
         xmlSettings.CheckCharacters = false;
         using (XmlWriter xmlWriter = XmlWriter.Create(writer, xmlSettings))
         {
             if (format == "gate_xml")
             {
                 document.WriteGateXml(xmlWriter, /*writeTopElement=*/true, /*removeBoilerplate=*/true);
                 // flush so that the StringWriter holds the output before it is read
                 xmlWriter.Flush();
                 response = writer.ToString();
             }
             else // xml
             {
                 document.WriteXml(xmlWriter, /*writeTopElement=*/true);
                 xmlWriter.Flush();
                 // the output is declared as utf-16; rewrite the declaration to utf-8
                 response = writer.ToString().Replace("<?xml version=\"1.0\" encoding=\"utf-16\"?>",
                     "<?xml version=\"1.0\" encoding=\"utf-8\"?>");
             }
         }
     }
     return response;
 }
 /// <summary>
 /// Produces the next corpus by reading the next XML file from the configured data directories.
 /// The corpus gets a fresh GUID, leading underscores are stripped from feature names,
 /// annotations and processing-specific features are removed, and document text is rebuilt
 /// from the "raw" feature when that feature is present.
 /// </summary>
 /// <returns>
 /// The loaded corpus, or null when all directories are exhausted (after calling Stop)
 /// or when the current directory has no more files and the next one is selected.
 /// </returns>
 /// <remarks>
 /// If reading or processing a file fails, the file index is advanced exactly once and the
 /// exception is rethrown with its original stack trace.
 /// </remarks>
 protected override object ProduceData()
 {
     // are we done?
     if (mCurrentDirIdx >= mDataDirs.Length)
     {
         Stop();
         return(null);
     }
     // do we need to get more files?
     if (mFiles == null)
     {
         mFiles = Directory.GetFiles(mDataDirs[mCurrentDirIdx], "*.xml");
         Array.Sort(mFiles);
     }
     // did we process all currently available files?
     if (mCurrentFileIdx >= mFiles.Length)
     {
         mFiles          = null;
         mCurrentFileIdx = 0;
         mCurrentDirIdx++;
         return(null);
     }
     DocumentCorpus corpus;
     try
     {
         // read next file
         mLogger.Info("ProduceData", "Reading " + mFiles[mCurrentFileIdx] + " ...");
         corpus = new DocumentCorpus();
         // dispose the reader even when parsing fails so the file handle is released
         using (StreamReader reader = new StreamReader(mFiles[mCurrentFileIdx]))
         {
             corpus.ReadXml(new XmlTextReader(reader));
         }
         // refresh corpus ID (to avoid conflicts)
         corpus.Features.SetFeatureValue("guid", Guid.NewGuid().ToString());
         // remove underscores in feature names (iterate over a copy because the features are modified)
         string[] tmp = new string[corpus.Features.Names.Count];
         corpus.Features.Names.CopyTo(tmp, /*index=*/ 0);
         foreach (string featureName in tmp)
         {
             if (featureName.StartsWith("_"))
             {
                 corpus.Features.SetFeatureValue(featureName.TrimStart('_'), corpus.Features.GetFeatureValue(featureName));
                 corpus.Features.RemoveFeature(featureName);
             }
         }
         foreach (Document doc in corpus.Documents)
         {
             // remove annotations
             doc.ClearAnnotations();
             // remove underscores in feature names (again via a copy of the name list)
             tmp = new string[doc.Features.Names.Count];
             doc.Features.Names.CopyTo(tmp, /*index=*/ 0);
             foreach (string featureName in tmp)
             {
                 if (featureName.StartsWith("_"))
                 {
                     doc.Features.SetFeatureValue(featureName.TrimStart('_'), doc.Features.GetFeatureValue(featureName));
                     doc.Features.RemoveFeature(featureName);
                 }
             }
             // remove processing-specific features
             foreach (string featureName in new string[] {
                 "detectedLanguage",
                 "detectedCharRange",
                 "bprBoilerplateCharCount",
                 "bprContentCharCount",
                 "domainName",
                 "urlKey",
                 "rev",
                 "blacklisted"
             })
             {
                 doc.Features.RemoveFeature(featureName);
             }
             // if there's raw data available, reset the content
             string raw = doc.Features.GetFeatureValue("raw");
             if (raw != null)
             {
                 doc.Features.SetFeatureValue("contentType", "Html");
                 // "raw" holds Base64 data that is decoded with the encoding chosen from the "charSet" feature
                 doc.Text = GetEncoding(doc.Features.GetFeatureValue("charSet")).GetString(Convert.FromBase64String(raw));
             }
         }
         mCurrentFileIdx++;
     }
     catch (Exception)
     {
         // skip the offending file on the next call, then rethrow without resetting the stack trace
         mCurrentFileIdx++;
         throw;
     }
     // Throttle while the reported branch load is above 10. This wait sits outside the try block
     // so that a failure here cannot advance the file index a second time and skip a file.
     while (WorkflowUtils.GetBranchLoadMax(this) > 10)
     {
         Thread.Sleep(1000);
     }
     return(corpus);
 }
 /// <summary>
 /// Launches the receiver thread unless the component is already running.
 /// The thread polls the messenger, parses each non-null message as a document corpus
 /// and dispatches it; parse/dispatch failures are logged and the loop continues.
 /// </summary>
 public override void Start()
 {
     // Nothing to do when the component reports that it is already running.
     if (IsRunning)
     {
         return;
     }
     ThreadStart receiveLoop = delegate()
     {
         // Keep polling until a stop is flagged or the messenger reports it is finished.
         while (!(mStopped || mMessenger.isMessagingFinished()))
         {
             string xml = mMessenger.getMessage();
             if (xml != null)
             {
                 try
                 {
                     DocumentCorpus corpus = new DocumentCorpus();
                     XmlReader xmlReader = new XmlTextReader(new StringReader(xml));
                     corpus.ReadXml(xmlReader);
                     xmlReader.Close();
                     DispatchData(corpus);
                 }
                 catch (Exception ex)
                 {
                     // A bad message must not terminate the receiver; log it and move on.
                     mLogger.Error("ZeroMqReceiverComponent", ex);
                 }
             }
             // Brief pause between polls.
             Thread.Sleep(1);
         }
     };
     mThread = new Thread(receiveLoop);
     // Clear the stop flag before the loop starts evaluating it.
     mStopped = false;
     mThread.Start();
 }