/// <summary>
/// Builds a UTF-8 corpus from the documents that <paramref name="reader"/>
/// extracts from each of the given files.
/// </summary>
/// <param name="fileNames">Paths of the files to read; each is opened read-only.</param>
/// <param name="reader">Reader that turns an open stream into zero or more documents.</param>
/// <returns>
/// The corpus holding every document that was added. A file whose processing
/// throws is skipped; documents added from it before the failure stay in the corpus.
/// </returns>
public Corpus GetCorpusFromText(IEnumerable<string> fileNames, IDocumentTextReader reader)
{
    Corpus corpus = new Corpus(Encoding.UTF8);

    foreach (string path in fileNames)
    {
        try
        {
            using (Stream input = new FileStream(path, FileMode.Open, FileAccess.Read))
            {
                WEMDocument[] parsed = reader.ReadDocuments(input);
                for (int i = 0; i < parsed.Length; i++)
                {
                    corpus.AddDocument(parsed[i]);
                }
            }
        }
        catch
        {
            // Best effort: any failure while handling this file is ignored
            // and processing moves on to the next file.
        }
    }

    return corpus;
}
/// <summary>
/// Builds a corpus with the given encoding by loading one document per file.
/// </summary>
/// <param name="fileNames">Paths handed, one at a time, to <c>WEMDocument.Load</c>.</param>
/// <param name="outputEncoding">Encoding used to create the corpus and each document.</param>
/// <returns>The corpus containing the result of every <c>Load</c> call, in input order.</returns>
/// <remarks>
/// Nothing is caught here: an exception from loading or adding a document
/// propagates to the caller and aborts the remaining files.
/// </remarks>
public Corpus GetCorpusFromXml(IEnumerable<string> fileNames, Encoding outputEncoding)
{
    Corpus corpus = new Corpus(outputEncoding);

    foreach (string path in fileNames)
    {
        var blank = new WEMDocument(corpus.Encoding);
        var loaded = blank.Load(path);
        corpus.AddDocument(loaded);
    }

    return corpus;
}
/// <summary>
/// Runs the console annotation pipeline: loads every <c>.txt</c> file found in
/// <c>_corpusDir</c> into a corpus, runs the ICTCLAS annotator over it, then the
/// CRF++ annotator, and finally saves each document (and, when
/// <c>_reportsEnabled</c> is set, a report) to the output directories.
/// </summary>
/// <remarks>
/// Does nothing when <c>startCheck()</c> returns false. The method blocks until
/// the asynchronous annotation chain has finished or failed. Failures are
/// printed to the console in red; they are not rethrown.
/// </remarks>
public void Start()
{
    if (!startCheck())
    {
        return;
    }

    Console.BackgroundColor = ConsoleColor.DarkBlue;
    Console.Clear();
    writeTitle();
    ConsoleColor color = Console.ForegroundColor;
    Console.ForegroundColor = ConsoleColor.Green;

    // Shared error report used by the annotator callbacks.
    Action<Exception> reportFailure = failure =>
    {
        Console.ForegroundColor = ConsoleColor.Red;
        Console.WriteLine();
        Console.WriteLine(MessageUnhandledException);
        Console.WriteLine(failure.Message);
        Console.WriteLine(failure.StackTrace);
    };

    try
    {
        Console.WriteLine(MessageInitializing);
        AnnotationSet wemas = new AnnotationSet();
        wemas.Load(_annotationSet);
        ICTCLASAnnotator ictclasAno = new ICTCLASAnnotator(wemas, null);
        CRFPPAnnotator crfppAno = new CRFPPAnnotator(wemas, new Model(_crfppModel));
        crfppAno.SetCRFPPRootPath(_crfppDir);
        Corpus corpus = new Corpus(Encoding.GetEncoding(_xmlEncoding));

        Console.Write(MessageLoadingCorpus);
        savePosition();

        // Materialise the file list once: it is indexed and counted repeatedly
        // below, and a deferred query would re-run the filter every time.
        string[] documents = Directory.GetFiles(_corpusDir)
            .Where(file => file.EndsWith(".txt", StringComparison.OrdinalIgnoreCase))
            .ToArray();

        if (documents.Length == 0)
        {
            Console.WriteLine();
            Console.WriteLine(MessageEmptyCorpus);
            Console.WriteLine(MessageDone);
            Console.ForegroundColor = color;
            pause();
            return;
        }

        _sentenceFactory = new SentenceFactory(wemas);
        _sentenceFactory.InputLanguage = Language.SimplifiedChinese;
        _sentenceFactory.OutputEncoding = corpus.Encoding;

        int count = 0;
        foreach (string path in documents)
        {
            Sentence[] sens;

            // Open read-only and release the handle as soon as the sentences
            // have been extracted.
            using (FileStream input = new FileStream(path, FileMode.Open, FileAccess.Read))
            {
                sens = _sentenceFactory.GetSentences(input);
            }

            WEMDocument wDoc = new WEMDocument(corpus.Encoding);
            wDoc.Name = path;
            foreach (var sentence in sens)
            {
                wDoc.AddSentence(sentence);
            }

            corpus.AddDocument(wDoc);
            rewrite(String.Format("{0}%", (int)((double)++count / documents.Length * 100)));
        }

        Console.WriteLine("共加载{0}篇文档。", corpus.Documents.Count);
        Console.Write(MessageAnnotatingSeg);
        savePosition();

        // Set by the callbacks once the whole chain has finished or failed;
        // polled by the wait loop at the bottom of this method.
        bool done = false;

        ictclasAno.AnnotationProgressChanged += (segSender, segProgress) =>
        {
            rewrite(String.Format("{0}%", Math.Round(segProgress.ProgressInDouble, 2) * 100));
        };

        ictclasAno.AnnotationCompleted += (segSender, segResult) =>
        {
            try
            {
                if (segResult.Error != null)
                {
                    reportFailure(segResult.Error);
                    done = true;
                    return;
                }

                rewrite("100%");
                Console.WriteLine(MessageAnnotatingEnt);

                crfppAno.AnnotationProgressChanged += (entSender, entProgress) =>
                {
                    Console.WriteLine(entProgress.Message);
                };

                crfppAno.AnnotationCompleted += (entSender, entResult) =>
                {
                    try
                    {
                        if (entResult.Error != null)
                        {
                            reportFailure(entResult.Error);
                            return;
                        }

                        Console.WriteLine(MessageAnnotationFinished);
                        if (_reportsEnabled)
                        {
                            Console.Write(MessageSavingDocsAndReps);
                        }
                        else
                        {
                            Console.Write(MessageSavingDocuments);
                        }

                        savePosition();

                        int saved = 0;
                        foreach (var document in corpus.Documents)
                        {
                            // Documents are matched to their source file by position,
                            // i.e. corpus.Documents is assumed to still be in the order
                            // the files were added above.
                            string sourceName = new FileInfo(documents[saved]).Name;

                            // Drop the three-character "txt" extension (the dot stays)
                            // and append "xml".
                            string xmlPath = _outputDir
                                + sourceName.Remove(sourceName.Length - 3)
                                + "xml";
                            document.Save(xmlPath);

                            if (_reportsEnabled)
                            {
                                saveReport(document, _reportsDir + sourceName, wemas);
                            }

                            rewrite(String.Format("{0}%",
                                (int)((double)++saved / corpus.Documents.Count * 100)));
                        }

                        Console.WriteLine(MessageDone);
                    }
                    catch (Exception saveError)
                    {
                        // An exception here is outside the outer try/catch's reach,
                        // so report it instead of letting it escape the callback.
                        reportFailure(saveError);
                    }
                    finally
                    {
                        // Always release the waiting caller, even when saving failed.
                        done = true;
                    }
                };

                if (crfppAno.Initialize())
                {
                    crfppAno.ProcessCorpusAsync(corpus);
                }
                else
                {
                    Console.ForegroundColor = ConsoleColor.Red;
                    Console.WriteLine(MessageEntInitFailed);
                    done = true;
                }
            }
            catch (Exception chainError)
            {
                // Without this, a failure while starting the second stage would
                // leave the caller waiting forever.
                reportFailure(chainError);
                done = true;
            }
        };

        if (ictclasAno.Initialize())
        {
            ictclasAno.ProcessCorpusAsync(corpus);
            while (!done)
            {
                Thread.Sleep(50);
            }
        }
        else
        {
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine(MessageSegInitFailed);
        }
    }
    catch (Exception ex)
    {
        Console.ForegroundColor = ConsoleColor.Red;
        Console.WriteLine();
        Console.WriteLine(MessageUnhandledException);
        Console.WriteLine(ex.Message);
    }

    Console.ForegroundColor = color;
}