/// <summary>
/// Starts the receiver thread unless the component is already running.
/// </summary>
/// <remarks>
/// The worker loops until <c>mStopped</c> is set or <c>mMessenger</c> reports that messaging
/// is finished. Every non-null message is parsed as a <c>DocumentCorpus</c> and handed to
/// <c>DispatchData</c>. A failure while handling one message is logged and does not end the loop.
/// </remarks>
public override void Start()
{
    if (IsRunning)
    {
        return;
    }
    mThread = new Thread(new ThreadStart(
        delegate()
        {
            while (!mStopped && !mMessenger.isMessagingFinished())
            {
                string message = mMessenger.getMessage();
                if (message != null)
                {
                    try
                    {
                        DocumentCorpus dc = new DocumentCorpus();
                        // using releases the reader even when ReadXml throws on a malformed message
                        using (XmlReader reader = new XmlTextReader(new StringReader(message)))
                        {
                            dc.ReadXml(reader);
                        }
                        DispatchData(dc);
                    }
                    catch (Exception e)
                    {
                        // log and carry on with the next message
                        mLogger.Error("ZeroMqReceiverComponent", e);
                        //File.WriteAllText(@"C:\Users\Administrator\Desktop\err\" + Guid.NewGuid().ToString("N") + ".xml", message, Encoding.UTF8);
                    }
                }
                // short pause between polls, also taken when no message was available
                Thread.Sleep(1);
            }
        }
    ));
    // clear the stop flag before the worker starts so its loop condition sees the fresh value
    mStopped = false;
    mThread.Start();
}
/// <summary>
/// Loads a FIRST XML corpus file and renders each of its documents as a GATE XML string.
/// </summary>
/// <param name="file">Path of the FIRST XML corpus file to read.</param>
/// <returns>
/// One GATE XML string per document, in the order <c>corpus.Documents</c> enumerates them.
/// </returns>
public static List<String> convertFIRSTXMLtoGATE(String file)
{
    List<String> documents = new List<string>();

    // load FIRST XML corpus; using closes the file even if parsing fails
    DocumentCorpus corpus = new DocumentCorpus();
    using (StreamReader fileReader = new StreamReader(file))
    using (XmlTextReader xmlReader = new XmlTextReader(fileReader))
    {
        corpus.ReadXml(xmlReader);
    }

    // render documents as GATE XML
    XmlWriterSettings xmlSettings = new XmlWriterSettings();
    xmlSettings.Indent = true;
    xmlSettings.NewLineOnAttributes = true;
    xmlSettings.CheckCharacters = false;
    xmlSettings.Encoding = Encoding.UTF8;

    foreach (Document doc in corpus.Documents)
    {
        StringBuilder docXML = new StringBuilder();
        using (XmlWriter writer = XmlWriter.Create(docXML, xmlSettings))
        {
            doc.WriteGateXml(writer, /*writeTopElement=*/true, /*removeBoilerplate=*/true);
        }
        // read the builder only after the writer is closed: XmlWriter buffers its output,
        // so taking the string earlier can return truncated XML
        documents.Add(docXML.ToString());
    }
    return documents;
}
/// <summary>
/// Converts a hard-coded FIRST XML corpus file into one GATE XML file per document,
/// then waits for a key press.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    // load FIRST XML corpus; using closes the input file even if parsing fails
    DocumentCorpus corpus = new DocumentCorpus();
    using (StreamReader fileReader = new StreamReader(@"D:\streamer\files\23_55_08_450f24c0969d49d2883fc17a6f4e2af0.xml"))
    using (XmlTextReader xmlReader = new XmlTextReader(fileReader))
    {
        corpus.ReadXml(xmlReader);
    }

    // save documents as GATE XML
    XmlWriterSettings xmlSettings = new XmlWriterSettings();
    xmlSettings.Indent = true;
    xmlSettings.NewLineOnAttributes = true;
    xmlSettings.CheckCharacters = false;
    xmlSettings.Encoding = Encoding.UTF8;

    int i = 0;
    foreach (Document doc in corpus.Documents)
    {
        // output files are numbered 1, 2, 3, ... in enumeration order
        using (StreamWriter streamWriter = new StreamWriter(string.Format(@"D:\streamer\output\{0}.xml", ++i)))
        using (XmlWriter writer = XmlWriter.Create(streamWriter, xmlSettings))
        {
            doc.WriteGateXml(writer, /*writeTopElement=*/true, /*removeBoilerplate=*/true);
        }
        // the XML writer is disposed first (flushing into the stream), then the stream itself
    }
    Console.ReadKey();
}
/// <summary>
/// Looks up a document inside a stored corpus file and returns it in the requested format.
/// </summary>
/// <param name="corpusId">Corpus ID; must be 32 characters long once dashes are removed.</param>
/// <param name="docId">Document ID; must be 32 characters long once dashes are removed.</param>
/// <param name="format">
/// "html" for an HTML page, "txt" for plain text, "gate_xml" for GATE XML; any other value
/// yields the document's own XML format.
/// </param>
/// <param name="rmvRaw">When true, the "raw" feature is removed from the document before rendering.</param>
/// <param name="changesOnly">
/// Only used for "txt": when true and the document's "rev" feature is not "1", only blocks
/// selected by "TextBlock/Content/Unseen" are included.
/// </param>
/// <param name="corpusTime">
/// Optional timestamp used to build the corpus file path directly; when null or empty, the data
/// directory is searched recursively for a file ending in the corpus ID.
/// </param>
/// <returns>
/// The rendered document, or a message starting with "*** " describing why it could not be produced.
/// </returns>
public string GetDoc(string corpusId, string docId, string format, bool rmvRaw, bool changesOnly, string corpusTime)
{
    string dataPath = Utils.GetConfigValue("DataPath", ".");

    // NOTE(review): only the length of the IDs is validated, not that they are hexadecimal,
    // and corpusId ends up in a file path / search pattern - consider stricter validation
    // if these values come from untrusted callers.
    if (corpusId == null || corpusId.Replace("-", "").Length != 32)
    {
        return "*** Invalid corpus ID.";
    }
    corpusId = corpusId.Replace("-", "");
    if (docId == null || docId.Replace("-", "").Length != 32)
    {
        return "*** Invalid document ID.";
    }
    docId = docId.Replace("-", "");

    string[] fileNames = null;
    if (!string.IsNullOrEmpty(corpusTime))
    {
        try
        {
            // build <dataPath>\<year>\<month>\<day>\HH_mm_ss_<corpusId>.xml from the timestamp
            DateTime dt = DateTime.Parse(corpusTime);
            string prefix = dt.ToString("HH_mm_ss_");
            string path = "\\" + dt.Year + "\\" + dt.Month + "\\" + dt.Day + "\\";
            string fileName = dataPath.TrimEnd('\\') + path + prefix + corpusId + ".xml";
            if (!Utils.VerifyFileNameOpen(fileName))
            {
                return "*** Corpus not found.";
            }
            fileNames = new string[] { fileName };
        }
        catch
        {
            // any failure in this block is reported as a time-parsing problem
            return "*** Unable to parse time.";
        }
    }
    if (fileNames == null)
    {
        fileNames = Directory.GetFiles(dataPath, "*" + corpusId + ".xml", SearchOption.AllDirectories);
    }
    if (fileNames.Length == 0)
    {
        return "*** Corpus not found.";
    }

    // load the corpus; using closes the file even when ReadXml throws
    DocumentCorpus corpus = new DocumentCorpus();
    using (StreamReader reader = new StreamReader(fileNames[0]))
    using (XmlTextReader xmlReader = new XmlTextReader(reader))
    {
        corpus.ReadXml(xmlReader);
    }

    Document document = null;
    foreach (Document doc in corpus.Documents)
    {
        // Guid.ToString("N") is lower-case, so compare ignoring case to accept upper-case IDs too
        string candidateId = new Guid(doc.Features.GetFeatureValue("guid")).ToString("N");
        if (string.Equals(candidateId, docId, StringComparison.OrdinalIgnoreCase))
        {
            document = doc;
            break;
        }
    }
    if (document == null)
    {
        return "*** Document not found.";
    }
    if (rmvRaw)
    {
        document.Features.RemoveFeature("raw");
    }

    string response;
    if (format == "html")
    {
        StringWriter writer = new StringWriter();
        document.MakeHtmlPage(writer, /*inlineCss=*/true);
        // strip the section delimited by the back_button marker comments
        string html = new Regex(@"<!--back_button-->.*?<!--/back_button-->").Replace(writer.ToString(), "");
        response = html;
    }
    else if (format == "txt")
    {
        StringBuilder txt = new StringBuilder();
        string selector = "TextBlock/Content";
        if (changesOnly && document.Features.GetFeatureValue("rev") != "1")
        {
            selector = "TextBlock/Content/Unseen";
        }
        foreach (TextBlock block in document.GetAnnotatedBlocks(selector))
        {
            txt.AppendLine(block.Text);
        }
        response = document.Name + "\r\n\r\n" + txt.ToString();
    }
    else
    {
        StringWriter writer = new StringWriter();
        XmlWriterSettings xmlSettings = new XmlWriterSettings();
        xmlSettings.Indent = true;
        xmlSettings.NewLineOnAttributes = true;
        xmlSettings.CheckCharacters = false;
        using (XmlWriter xmlWriter = XmlWriter.Create(writer, xmlSettings))
        {
            if (format == "gate_xml")
            {
                document.WriteGateXml(xmlWriter, /*writeTopElement=*/true, /*removeBoilerplate=*/true);
                xmlWriter.Flush();
                response = writer.ToString();
            }
            else // xml
            {
                document.WriteXml(xmlWriter, /*writeTopElement=*/true);
                xmlWriter.Flush();
                // the writer targets a string, so it declares utf-16; rewrite the declaration to utf-8
                response = writer.ToString().Replace("<?xml version=\"1.0\" encoding=\"utf-16\"?>", "<?xml version=\"1.0\" encoding=\"utf-8\"?>");
            }
        }
    }
    return response;
}
/// <summary>
/// Reads the next corpus file from the configured data directories, strips processing
/// artifacts from it and returns it for dispatching.
/// </summary>
/// <returns>
/// The cleaned <c>DocumentCorpus</c>; or null when all directories have been processed
/// (in which case <c>Stop()</c> is called first) or when the current directory has just been
/// exhausted and the reader moved on to the next one.
/// </returns>
/// <remarks>
/// The file index is advanced exactly once per file, whether reading succeeds or fails, so a
/// broken file is not retried. Exceptions propagate to the caller with their original stack trace.
/// </remarks>
protected override object ProduceData()
{
    // are we done?
    if (mCurrentDirIdx >= mDataDirs.Length)
    {
        Stop();
        return null;
    }
    // do we need to get more files?
    if (mFiles == null)
    {
        mFiles = Directory.GetFiles(mDataDirs[mCurrentDirIdx], "*.xml");
        Array.Sort(mFiles);
    }
    // did we process all currently available files?
    if (mCurrentFileIdx >= mFiles.Length)
    {
        mFiles = null;
        mCurrentFileIdx = 0;
        mCurrentDirIdx++;
        return null;
    }

    DocumentCorpus corpus;
    try
    {
        // read next file; using closes it even when ReadXml throws
        mLogger.Info("ProduceData", "Reading " + mFiles[mCurrentFileIdx] + " ...");
        corpus = new DocumentCorpus();
        using (StreamReader reader = new StreamReader(mFiles[mCurrentFileIdx]))
        using (XmlTextReader xmlReader = new XmlTextReader(reader))
        {
            corpus.ReadXml(xmlReader);
        }
        // refresh corpus ID (to avoid conflicts)
        corpus.Features.SetFeatureValue("guid", Guid.NewGuid().ToString());
        // remove underscores in feature names (iterate over a copy because the set is modified)
        string[] tmp = new string[corpus.Features.Names.Count];
        corpus.Features.Names.CopyTo(tmp, /*index=*/ 0);
        foreach (string featureName in tmp)
        {
            if (featureName.StartsWith("_"))
            {
                corpus.Features.SetFeatureValue(featureName.TrimStart('_'), corpus.Features.GetFeatureValue(featureName));
                corpus.Features.RemoveFeature(featureName);
            }
        }
        foreach (Document doc in corpus.Documents)
        {
            // remove annotations
            doc.ClearAnnotations();
            // remove underscores in feature names (iterate over a copy because the set is modified)
            tmp = new string[doc.Features.Names.Count];
            doc.Features.Names.CopyTo(tmp, /*index=*/ 0);
            foreach (string featureName in tmp)
            {
                if (featureName.StartsWith("_"))
                {
                    doc.Features.SetFeatureValue(featureName.TrimStart('_'), doc.Features.GetFeatureValue(featureName));
                    doc.Features.RemoveFeature(featureName);
                }
            }
            // remove processing-specific features
            foreach (string featureName in new string[] { "detectedLanguage", "detectedCharRange", "bprBoilerplateCharCount", "bprContentCharCount", "domainName", "urlKey", "rev", "blacklisted" })
            {
                doc.Features.RemoveFeature(featureName);
            }
            // if there's raw data available, reset the content
            string raw = doc.Features.GetFeatureValue("raw");
            if (raw != null)
            {
                doc.Features.SetFeatureValue("contentType", "Html");
                doc.Text = GetEncoding(doc.Features.GetFeatureValue("charSet")).GetString(Convert.FromBase64String(raw));
            }
        }
    }
    finally
    {
        // move past this file exactly once, on success and on failure alike
        mCurrentFileIdx++;
    }

    // hold the corpus back while GetBranchLoadMax reports more than 10
    // (presumably the backlog of the downstream branch - confirm against WorkflowUtils)
    while (WorkflowUtils.GetBranchLoadMax(this) > 10)
    {
        Thread.Sleep(1000);
    }
    return corpus;
}