/// <summary>
/// Writes a plain-text report of a document to a file, one output line per sentence.
/// Entities are written as "[content \ annotation] " where the annotation is looked up
/// in <paramref name="annotationSet"/> by the entity's id; every other word is written
/// as its content followed by a single space.
/// </summary>
/// <param name="document">The document whose sentences and words are reported.</param>
/// <param name="fileName">Path of the report file. An existing file is overwritten.</param>
/// <param name="annotationSet">Annotation set indexed by entity id to obtain the text written next to each entity.</param>
static void saveReport(WEMDocument document, string fileName, AnnotationSet annotationSet)
{
    // FileMode.Create truncates an existing file. OpenOrCreate would keep the old
    // bytes, so a shorter report would leave a stale tail from the previous run.
    using (StreamWriter writer = new StreamWriter(new FileStream(fileName, FileMode.Create)))
    {
        foreach (var sen in document.Sentences)
        {
            foreach (var word in sen.Words)
            {
                if (word is Entity)
                {
                    Entity entity = (Entity)word;
                    writer.Write(String.Format("[{0} \\ {1}] ",
                        entity.Content, annotationSet[entity.EntityId]));
                }
                else
                {
                    writer.Write(word.Content + ' ');
                }
            }

            // End of sentence: terminate the report line.
            writer.WriteLine();
        }
        // No explicit Close(): leaving the using block flushes and closes the writer
        // together with the underlying stream.
    }
}
/// <summary>
/// Adds a document to the document set.
/// </summary>
/// <param name="doc">The document to add.</param>
/// <exception cref="ArgumentNullException"><paramref name="doc"/> is null.</exception>
public void AddDocument(WEMDocument doc)
{
    if (doc == null)
    {
        throw new ArgumentNullException("doc");
    }

    // Bring the document in line with this set's encoding before storing it.
    IEncodingConvertible convertible = (IEncodingConvertible)doc;
    if (convertible.Encoding != this.Encoding)
    {
        convertible.ChangeEncoding(this.Encoding);
    }

    _documents.Add(doc);
}
/// <summary>
/// Reads the sentences from a text stream and wraps them in a single document.
/// </summary>
/// <param name="textStream">Stream handed to the sentence factory.</param>
/// <returns>An array containing exactly one document holding all sentences read.</returns>
public WEMDocument[] ReadDocuments(Stream textStream)
{
    // The output encoding is always reset to UTF-8 before reading.
    OutputEncoding = Encoding.UTF8;

    Sentence[] sentences = _sententceFactory.GetSentences(textStream);
    WEMDocument result = new WEMDocument(OutputEncoding);
    for (int i = 0; i < sentences.Length; i++)
    {
        result.AddSentence(sentences[i]);
    }

    result.ApplicationId = AnnotationSet.Id;
    return new WEMDocument[] { result };
}
/// <summary>
/// Runs the console annotation pipeline. After <c>startCheck()</c> succeeds it:
/// loads the annotation set, loads every <c>.txt</c> file of the corpus directory into
/// a corpus, runs the ICTCLAS annotator and then the CRF++ annotator asynchronously,
/// and finally saves each document as XML (plus a text report when reports are enabled).
/// The calling thread blocks (polling) until the pipeline reports completion or failure.
/// Progress and errors are written to the console; exceptions thrown on the calling
/// thread are caught and printed rather than propagated.
/// </summary>
public void Start()
{
    if (!startCheck())
    {
        return;
    }

    Console.BackgroundColor = ConsoleColor.DarkBlue;
    Console.Clear();
    writeTitle();

    // Remember the caller's colour so it can be restored on every exit path below.
    ConsoleColor color = Console.ForegroundColor;
    Console.ForegroundColor = ConsoleColor.Green;
    try
    {
        Console.WriteLine(MessageInitializing);
        AnnotationSet wemas = new AnnotationSet();
        wemas.Load(_annotationSet);
        ICTCLASAnnotator ictclasAno = new ICTCLASAnnotator(wemas, null);
        CRFPPAnnotator crfppAno = new CRFPPAnnotator(wemas, new Model(_crfppModel));
        crfppAno.SetCRFPPRootPath(_crfppDir);
        Corpus corpus = new Corpus(Encoding.GetEncoding(_xmlEncoding));

        Console.Write(MessageLoadingCorpus);
        savePosition();

        // Materialise the file list once. The bare LINQ query is deferred, so every
        // Count()/ElementAt() call would re-scan the directory, and a change to the
        // directory during the run could pair documents with the wrong file names.
        string[] documents = (from file in Directory.GetFiles(_corpusDir)
                              where file.ToUpper().EndsWith(".TXT")
                              select file).ToArray();
        if (documents.Length == 0)
        {
            Console.WriteLine();
            Console.WriteLine(MessageEmptyCorpus);
            Console.WriteLine(MessageDone);
            Console.ForegroundColor = color;
            pause();
            return;
        }

        _sentenceFactory = new SentenceFactory(wemas);
        _sentenceFactory.InputLanguage = Language.SimplifiedChinese;
        _sentenceFactory.OutputEncoding = corpus.Encoding;

        int count = 0;
        foreach (var doc in documents)
        {
            WEMDocument wDoc;

            // Dispose the input stream once its sentences have been consumed;
            // previously one file handle per corpus file was leaked.
            using (FileStream input = new FileStream(doc, FileMode.Open))
            {
                Sentence[] sens = _sentenceFactory.GetSentences(input);
                wDoc = new WEMDocument(corpus.Encoding);
                wDoc.Name = doc;
                foreach (var sen in sens)
                {
                    wDoc.AddSentence(sen);
                }
            }

            corpus.AddDocument(wDoc);
            rewrite(String.Format("{0}%",
                (int)((double)++count / documents.Length * 100)));
        }

        Console.WriteLine("共加载{0}篇文档。", corpus.Documents.Count);
        Console.Write(MessageAnnotatingSeg);
        savePosition();

        // Set by the completion callbacks and polled by this thread below.
        bool done = false;

        ictclasAno.AnnotationProgressChanged += (s, ea) =>
        {
            rewrite(String.Format("{0}%", Math.Round(ea.ProgressInDouble, 2) * 100));
        };

        ictclasAno.AnnotationCompleted += (s, ea) =>
        {
            if (ea.Error == null)
            {
                rewrite("100%");
                Console.WriteLine(MessageAnnotatingEnt);

                crfppAno.AnnotationProgressChanged += (sen, args) =>
                {
                    Console.WriteLine(args.Message);
                };

                crfppAno.AnnotationCompleted += (sen, args) =>
                {
                    if (args.Error == null)
                    {
                        Console.WriteLine(MessageAnnotationFinished);
                        if (_reportsEnabled)
                        {
                            Console.Write(MessageSavingDocsAndReps);
                        }
                        else
                        {
                            Console.Write(MessageSavingDocuments);
                        }
                        savePosition();

                        // NOTE(review): an exception thrown while saving leaves 'done'
                        // unset, so the polling loop below would never finish — confirm
                        // how the annotator surfaces handler exceptions.
                        int sCount = 0;
                        foreach (var doc in corpus.Documents)
                        {
                            // Documents were added in the same order as 'documents',
                            // so the corpus index maps back to the source file path.
                            string reportFileName;
                            string fileName = documents[corpus.Documents.IndexOf(doc)];
                            FileInfo fi = new FileInfo(fileName);
                            reportFileName = fileName = fi.Name;

                            // Swap the three-character "txt" extension for "xml".
                            fileName = fileName.Remove(fileName.Length - 3);
                            fileName = _outputDir + fileName + "xml";
                            doc.Save(fileName);

                            if (_reportsEnabled)
                            {
                                saveReport(doc, _reportsDir + reportFileName, wemas);
                            }

                            rewrite(String.Format("{0}%",
                                (int)((double)++sCount / corpus.Documents.Count * 100)));
                        }

                        Console.WriteLine(MessageDone);
                        done = true;
                    }
                    else
                    {
                        Console.ForegroundColor = ConsoleColor.Red;
                        Console.WriteLine();
                        Console.WriteLine(MessageUnhandledException);
                        Console.WriteLine(args.Error.Message);
                        Console.WriteLine(args.Error.StackTrace);
                        done = true;
                    }
                };

                if (crfppAno.Initialize())
                {
                    crfppAno.ProcessCorpusAsync(corpus);
                }
                else
                {
                    Console.ForegroundColor = ConsoleColor.Red;
                    Console.WriteLine(MessageEntInitFailed);
                    done = true;
                    return;
                }
            }
            else
            {
                Console.ForegroundColor = ConsoleColor.Red;
                Console.WriteLine();
                Console.WriteLine(MessageUnhandledException);
                Console.WriteLine(ea.Error.Message);
                Console.WriteLine(ea.Error.StackTrace);
                done = true;
            }
        };

        if (ictclasAno.Initialize())
        {
            ictclasAno.ProcessCorpusAsync(corpus);

            // Block until one of the completion callbacks signals the end of the run.
            while (!done)
            {
                Thread.Sleep(50);
            }
        }
        else
        {
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine(MessageSegInitFailed);
        }
    }
    catch (Exception ex)
    {
        Console.ForegroundColor = ConsoleColor.Red;
        Console.WriteLine();
        Console.WriteLine(MessageUnhandledException);
        Console.WriteLine(ex.Message);
    }

    Console.ForegroundColor = color;
}
/// <summary>
/// Removes a document from the document set.
/// </summary>
/// <param name="doc">The document to remove.</param>
public void RemoveDocument(WEMDocument doc)
{
    _documents.Remove(doc);
}
/// <summary>
/// Interprets one console command line. The first whitespace-separated token selects
/// the command (case-insensitive):
/// <list type="bullet">
/// <item><description><c>exit</c> — sets the exit flag (no parameters allowed).</description></item>
/// <item><description><c>clear</c> — clears the console (no parameters allowed).</description></item>
/// <item><description><c>genas</c> — builds an annotation set from <c>WEMAS.xml</c> and saves it as <c>WEMAS.wemas</c>.</description></item>
/// <item><description><c>testas</c> — loads <c>WEMAS.wemas</c>, sets its description and saves it as <c>WEMAS2.wemas</c>.</description></item>
/// <item><description><c>testws</c> — annotates <c>utf8.txt</c> with both annotators, saves <c>RESULT.xml</c> and prints every word.</description></item>
/// </list>
/// An empty command is ignored; anything else prints an "Unknown Command" message.
/// </summary>
/// <param name="command">The raw command line typed by the user.</param>
static void ProcessCommand(string command)
{
    string[] blocks = command.Split();
    switch (blocks[0].ToLower())
    {
        case "":
            return;

        case "exit":
            if (blocks.Length == 1)
            {
                _exitFlag = true;
            }
            else
            {
                Console.WriteLine("Unknown Parameters.");
            }
            return;

        case "clear":
            if (blocks.Length == 1)
            {
                Console.Clear();
            }
            else
            {
                Console.WriteLine("Unknown Parameters.");
            }
            return;

        case "genas":
            // Convert the dictionary file into an annotation set file.
            AnnotationDictionary dict = new AnnotationDictionary();
            AnnotationSet ans = new AnnotationSet("WEMAS");
            dict.Load("WEMAS.xml");
            foreach (var entry in dict._Dictionary)
            {
                ans.SetAnnotationDescription(entry.Key, entry.Value);
            }
            foreach (var sep in dict.SentenceSeparators)
            {
                ans.AddSentenceSeparator(sep);
            }
            ans.Save("WEMAS.wemas");
            return;

        case "testas":
            // Load / modify / save round trip of an annotation set.
            AnnotationSet ans2 = new AnnotationSet("WEMAS");
            ans2.Load("WEMAS.wemas");
            ans2.Description = "WEB ENTITY MINER Default Annotation Set.";
            ans2.Save("WEMAS2.wemas");
            return;

        case "testws":
            AnnotationSet wemas = new AnnotationSet();
            wemas.Load("WEMAS.wemas");
            ICTCLASAnnotator ano = new ICTCLASAnnotator(wemas, null);
            CRFPPAnnotator crfAno = new CRFPPAnnotator(wemas,
                new Model(AppDomain.CurrentDomain.BaseDirectory + "model.crfppmodel"));
            Corpus c = new Corpus(Encoding.UTF8);
            SentenceFactory.AnnotationSet = wemas;
            SentenceFactory.InputLanguage = Language.SimplifiedChinese;
            SentenceFactory.OutputEncoding = Encoding.UTF8;

            WEMDocument doc;

            // Dispose the input stream once the sentences have been consumed;
            // previously the file handle was never released.
            using (FileStream input = new FileStream("utf8.txt", FileMode.Open, FileAccess.Read))
            {
                var sens = SentenceFactory.GetSentences(input);
                doc = new WEMDocument(Encoding.UTF8);
                foreach (var sen in sens)
                {
                    doc.AddSentence(sen);
                }
            }
            c.AddDocument(doc);

            try
            {
                ano.ProcessCorpus(c);
                crfAno.ProcessCorpus(c);
            }
            catch (Exception ex)
            {
                // Best effort: report the failure, then still save and print
                // whatever state the document is in.
                Console.WriteLine("Unhandled Exception:\n{0}", ex.Message);
            }

            doc.Save("RESULT.xml");
            foreach (var sententce in doc.Sentences)
            {
                foreach (var word in sententce.Words)
                {
                    if (word is Entity)
                    {
                        Console.WriteLine("{0}/ENTITY:{1}", word.Content,
                            wemas[((Entity)word).EntityId]);
                    }
                    else
                    {
                        Console.WriteLine("{0}", word.Content);
                    }
                }
            }
            return;

        default:
            Console.WriteLine("Unknown Command: '{0}'.", blocks[0]);
            return;
    }
}
/// <summary>
/// Creates a new, empty document named <c>DocumentName</c> and registers it under that
/// name, unless a document with that name is already registered.
/// </summary>
public void CreateDocument()
{
    // Nothing to do when the name is already taken.
    if (_documents.ContainsKey(DocumentName))
    {
        return;
    }

    WEMDocument created = new WEMDocument(_encoding) { Name = DocumentName };
    _documents.Add(DocumentName, created);
}