/// <summary>
/// Segments every sentence of every document in <paramref name="corpus"/> and
/// stores the resulting words on the sentence, replacing any existing words.
/// </summary>
/// <param name="corpus">The corpus to annotate; its encoding is passed to the segmenter.</param>
/// <param name="token">
/// Checked once before each document. When cancellation has been requested the
/// loop simply stops; no exception is thrown.
/// </param>
/// <remarks>
/// A progress event carrying the number of processed documents and the total
/// document count is raised after each document.
/// </remarks>
protected override void AnnotateCorpus(Corpus corpus, System.Threading.CancellationToken token)
{
    Encoding encoding = corpus.Encoding;
    int processed = 0;

    foreach (var document in corpus.Documents)
    {
        if (token.IsCancellationRequested)
        {
            break;
        }

        foreach (var sentence in document.Sentences)
        {
            // Drop previous segmentation before writing the new one.
            sentence.ClearWords();

            ResultTerm[] terms = _ictclas.Segment(sentence.Content, encoding);
            for (int i = 0; i < terms.Length; i++)
            {
                ResultTerm term = terms[i];
                Word word = new Word(term.Start, term.Length, term.ClassID);
                sentence.AnnotateWord(word);
            }
        }

        processed++;
        OnAnnotationProgressChanged(
            new AnnotationProgressChangedEventArgs(processed, corpus.Documents.Count));
    }
}
/// <summary>
/// Builds a corpus from plain-text files by reading each file with
/// <paramref name="reader"/>.
/// </summary>
/// <param name="fileNames">Paths of the files to read.</param>
/// <param name="reader">Reader that turns a file stream into documents.</param>
/// <param name="outputEncoding">
/// Encoding handed to the reader and used for the returned corpus. When null,
/// the corpus falls back to UTF-8.
/// </param>
/// <returns>A corpus containing the documents of every file that could be read.</returns>
/// <remarks>
/// Loading is best-effort: a file that cannot be opened or parsed is skipped and
/// the remaining files are still processed.
/// </remarks>
public Corpus GetCorpusFromText(IEnumerable<string> fileNames, IDocumentTextReader reader, Encoding outputEncoding)
{
    // The corpus must declare the same encoding its documents were produced with;
    // previously it was always UTF-8 regardless of outputEncoding.
    Corpus cor = new Corpus(outputEncoding ?? Encoding.UTF8);

    foreach (var file in fileNames)
    {
        try
        {
            using (Stream stream = new FileStream(file, FileMode.Open, FileAccess.Read))
            {
                WEMDocument[] docs = reader.ReadDocuments(stream, outputEncoding);
                foreach (var doc in docs)
                {
                    cor.AddDocument(doc);
                }
            }
        }
        catch
        {
            // Deliberate best-effort: skip unreadable files instead of aborting the batch.
            continue;
        }
    }

    return cor;
}
/// <summary>
/// Annotates <paramref name="corpus"/> by writing it to a temporary CRF++ data
/// file, running the CRF++ annotation on that file, and decoding the output
/// file back into the corpus.
/// </summary>
/// <param name="corpus">The corpus to annotate.</param>
/// <param name="token">
/// Checked before starting and after each of the three stages; a requested
/// cancellation surfaces as an exception from <c>ThrowIfCancellationRequested</c>.
/// </param>
/// <remarks>
/// Work files live in a per-call "Tasks/&lt;guid&gt;" directory under the application
/// base directory. Progress is reported as steps 1, 2 and 3 of 3. Cleanup of the
/// work files is best-effort so that a cleanup failure never hides the exception
/// that caused the annotation itself to fail.
/// </remarks>
protected override void AnnotateCorpus(
    Corpus corpus, System.Threading.CancellationToken token)
{
    token.ThrowIfCancellationRequested();

    Guid taskId = Guid.NewGuid();
    DirectoryInfo dir = new DirectoryInfo(
        AppDomain.CurrentDomain.BaseDirectory);
    dir = dir.CreateSubdirectory("Tasks");
    dir = dir.CreateSubdirectory(taskId.ToString());
    string dataFile = Path.Combine(dir.FullName, "Corpus.crfppdata");
    string outputFile = Path.Combine(dir.FullName, "Output.crfppdata");

    try
    {
        OnAnnotationProgressChanged(
            new AnnotationProgressChangedEventArgs(1, 3, MessagePreparing));
        CRFPPHelper.EncodeCorpusToCRFPPData(corpus, dataFile);
        token.ThrowIfCancellationRequested();

        OnAnnotationProgressChanged(
            new AnnotationProgressChangedEventArgs(2, 3, MessageAnnotating));
        CRFPPHelper.Annotate(
            Model.RootPath, dataFile, outputFile, 1, 0);
        token.ThrowIfCancellationRequested();

        OnAnnotationProgressChanged(
            new AnnotationProgressChangedEventArgs(3, 3, MessageFinishing));
        CRFPPHelper.DecodeCorpusFromCRFPPData(corpus, outputFile);
        token.ThrowIfCancellationRequested();
    }
    finally
    {
        // Each step is attempted independently: a locked data file must not
        // prevent removal of the output file, and none of them may replace an
        // exception already propagating out of the try block.
        try { File.Delete(dataFile); }
        catch (IOException) { }
        catch (UnauthorizedAccessException) { }

        try { File.Delete(outputFile); }
        catch (IOException) { }
        catch (UnauthorizedAccessException) { }

        try { dir.Delete(); }
        catch { }
    }
}
/// <summary>
/// Builds a corpus by loading one document from each of the given XML files.
/// </summary>
/// <param name="fileNames">Paths of the XML document files to load.</param>
/// <param name="outputEncoding">Encoding used for the corpus and for each loaded document.</param>
/// <returns>A corpus containing one document per file, in enumeration order.</returns>
/// <remarks>
/// No exception handling is done here; a failure while loading any file
/// propagates to the caller.
/// </remarks>
public Corpus GetCorpusFromXml(IEnumerable<string> fileNames, Encoding outputEncoding)
{
    Corpus result = new Corpus(outputEncoding);

    foreach (string path in fileNames)
    {
        // Each document is created with the corpus encoding before loading.
        var loaded = new WEMDocument(result.Encoding).Load(path);
        result.AddDocument(loaded);
    }

    return result;
}
/// <summary>
/// Runs the console annotation pipeline: loads the annotation set, builds a
/// corpus from the ".txt" files in the corpus directory, runs the ICTCLAS
/// annotator and then the CRF++ annotator over it, and saves every document as
/// an XML file in the output directory (plus a report per document when reports
/// are enabled).
/// </summary>
/// <remarks>
/// Does nothing when <c>startCheck()</c> fails. Both annotators run
/// asynchronously; this method blocks by polling a completion flag that the
/// completion handlers set. Errors are written to the console in red rather
/// than rethrown, and the original foreground colour is restored on exit.
/// </remarks>
public void Start()
{
    if (!startCheck())
    {
        return;
    }

    Console.BackgroundColor = ConsoleColor.DarkBlue;
    Console.Clear();
    writeTitle();
    ConsoleColor color = Console.ForegroundColor;
    Console.ForegroundColor = ConsoleColor.Green;

    try
    {
        Console.WriteLine(MessageInitializing);
        AnnotationSet wemas = new AnnotationSet();
        wemas.Load(_annotationSet);
        ICTCLASAnnotator ictclasAno = new ICTCLASAnnotator(wemas, null);
        CRFPPAnnotator crfppAno = new CRFPPAnnotator(wemas, new Model(_crfppModel));
        crfppAno.SetCRFPPRootPath(_crfppDir);
        Corpus corpus = new Corpus(Encoding.GetEncoding(_xmlEncoding));

        Console.Write(MessageLoadingCorpus);
        savePosition();

        // Materialize once: the list is counted and indexed repeatedly below,
        // and a deferred query would re-filter the directory listing each time.
        var documents =
            (from file in Directory.GetFiles(_corpusDir)
             where file.EndsWith(".txt", StringComparison.OrdinalIgnoreCase)
             select file).ToList();

        if (documents.Count == 0)
        {
            Console.WriteLine();
            Console.WriteLine(MessageEmptyCorpus);
            Console.WriteLine(MessageDone);
            Console.ForegroundColor = color;
            pause();
            return;
        }

        _sentenceFactory = new SentenceFactory(wemas);
        _sentenceFactory.InputLanguage = Language.SimplifiedChinese;
        _sentenceFactory.OutputEncoding = corpus.Encoding;

        int count = 0;
        foreach (var doc in documents)
        {
            Sentence[] sens;
            // Read-only access is enough, and the stream must be released
            // once the sentences have been extracted.
            using (FileStream input = new FileStream(doc, FileMode.Open, FileAccess.Read))
            {
                sens = _sentenceFactory.GetSentences(input);
            }

            WEMDocument wDoc = new WEMDocument(corpus.Encoding);
            wDoc.Name = doc;
            foreach (var sen in sens)
            {
                wDoc.AddSentence(sen);
            }
            corpus.AddDocument(wDoc);

            rewrite(String.Format("{0}%",
                (int)((double)++count / documents.Count * 100)));
        }

        Console.WriteLine("共加载{0}篇文档。", corpus.Documents.Count);
        Console.Write(MessageAnnotatingSeg);
        savePosition();

        // Set by whichever completion handler ends the pipeline; polled below.
        bool done = false;

        ictclasAno.AnnotationProgressChanged += (s, ea) =>
        {
            rewrite(String.Format("{0}%",
                Math.Round(ea.ProgressInDouble, 2) * 100));
        };

        ictclasAno.AnnotationCompleted += (s, ea) =>
        {
            if (ea.Error == null)
            {
                rewrite("100%");
                Console.WriteLine(MessageAnnotatingEnt);

                crfppAno.AnnotationProgressChanged += (sen, args) =>
                {
                    Console.WriteLine(args.Message);
                };

                crfppAno.AnnotationCompleted += (sen, args) =>
                {
                    if (args.Error == null)
                    {
                        Console.WriteLine(MessageAnnotationFinished);
                        if (_reportsEnabled)
                            Console.Write(MessageSavingDocsAndReps);
                        else
                            Console.Write(MessageSavingDocuments);
                        savePosition();

                        int sCount = 0;
                        foreach (var doc in corpus.Documents)
                        {
                            // Documents were added in the same order as the
                            // input files, so the corpus index maps back to
                            // the source path.
                            string fileName = documents[corpus.Documents.IndexOf(doc)];
                            FileInfo fi = new FileInfo(fileName);
                            string reportFileName = fileName = fi.Name;

                            // Swap the three-character "txt" extension for "xml".
                            fileName = fileName.Remove(fileName.Length - 3);
                            fileName = _outputDir + fileName + "xml";
                            doc.Save(fileName);

                            if (_reportsEnabled)
                                saveReport(doc, _reportsDir + reportFileName, wemas);

                            rewrite(String.Format("{0}%",
                                (int)((double)++sCount / corpus.Documents.Count * 100)));
                        }

                        Console.WriteLine(MessageDone);
                        done = true;
                    }
                    else
                    {
                        Console.ForegroundColor = ConsoleColor.Red;
                        Console.WriteLine();
                        Console.WriteLine(MessageUnhandledException);
                        Console.WriteLine(args.Error.Message);
                        Console.WriteLine(args.Error.StackTrace);
                        done = true;
                    }
                };

                if (crfppAno.Initialize())
                {
                    crfppAno.ProcessCorpusAsync(corpus);
                }
                else
                {
                    Console.ForegroundColor = ConsoleColor.Red;
                    Console.WriteLine(MessageEntInitFailed);
                    done = true;
                    return;
                }
            }
            else
            {
                Console.ForegroundColor = ConsoleColor.Red;
                Console.WriteLine();
                Console.WriteLine(MessageUnhandledException);
                Console.WriteLine(ea.Error.Message);
                Console.WriteLine(ea.Error.StackTrace);
                done = true;
            }
        };

        if (ictclasAno.Initialize())
        {
            ictclasAno.ProcessCorpusAsync(corpus);
            while (!done)
                Thread.Sleep(50);
        }
        else
        {
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine(MessageSegInitFailed);
        }
    }
    catch (Exception ex)
    {
        Console.ForegroundColor = ConsoleColor.Red;
        Console.WriteLine();
        Console.WriteLine(MessageUnhandledException);
        Console.WriteLine(ex.Message);
    }

    Console.ForegroundColor = color;
}
/// <summary>
/// Trains on a corpus synchronously.
/// </summary>
/// <param name="corpus">The training corpus.</param>
/// <returns>The statistical model produced by training.</returns>
/// <exception cref="InvalidOperationException">An operation is already running.</exception>
/// <exception cref="ApplicationException">Initialization failed.</exception>
public Model Train(Corpus corpus)
{
    if (_isRunning)
    {
        throw new InvalidOperationException(MessageInvalidOperation);
    }

    bool ready = Initialize();
    if (!ready)
    {
        throw new ApplicationException(MessageInitializeFailed);
    }

    // The synchronous path cannot be cancelled, hence the empty token.
    return LearnCorpus(corpus, CancellationToken.None);
}
/// <summary>
/// Abstract method that performs training on a corpus.
/// </summary>
/// <param name="corpus">The training corpus.</param>
/// <param name="token">
/// Cancellation token supplied by the caller; how (and whether) it is honoured
/// is up to the implementation.
/// </param>
/// <returns>The statistical model produced by training.</returns>
/// <remarks>
/// Implementations should complete synchronously.
/// </remarks>
protected abstract Model LearnCorpus(Corpus corpus, CancellationToken token);
/// <summary>
/// Trains on a corpus asynchronously on a background task.
/// </summary>
/// <param name="corpus">The training corpus.</param>
/// <exception cref="InvalidOperationException">An operation is already running.</exception>
/// <exception cref="ApplicationException">Initialization failed.</exception>
/// <remarks>
/// The outcome is delivered through the training-completed event: with the
/// model on success, or with the exception on failure. If a completion handler
/// throws while the success is being reported, that exception is caught and
/// reported through a second completion event. The running flag is always
/// cleared when the background work ends, even if a handler throws.
/// </remarks>
public void TrainAsync(Corpus corpus)
{
    if (_isRunning)
        throw new InvalidOperationException(MessageInvalidOperation);
    if (!Initialize())
        throw new ApplicationException(MessageInitializeFailed);

    _isRunning = true;
    _cts = new CancellationTokenSource();

    // Capture the token now so the worker never re-reads the _cts field.
    CancellationToken token = _cts.Token;

    Task.Factory.StartNew(() =>
    {
        try
        {
            try
            {
                Model model = LearnCorpus(corpus, token);
                OnTrainingCompleted(
                    new TrainingCompletedEventArgs(model, null, false, null));
            }
            catch (Exception ex)
            {
                OnTrainingCompleted(
                    new TrainingCompletedEventArgs(null, ex, false, null));
            }
        }
        finally
        {
            // Without this, a handler that throws while an error is being
            // reported would leave the trainer permanently "running".
            _isRunning = false;
        }
    }, token);
}
/// <summary>
/// Abstract method that annotates a corpus.
/// </summary>
/// <param name="corpus">The corpus to annotate.</param>
/// <param name="token">
/// Cancellation token supplied by the caller; how (and whether) it is honoured
/// is up to the implementation.
/// </param>
/// <remarks>
/// Implementations should complete synchronously.
/// </remarks>
protected abstract void AnnotateCorpus(Corpus corpus, CancellationToken token);
/// <summary>
/// Processes (annotates) a corpus asynchronously on a background task.
/// </summary>
/// <param name="corpus">The corpus to process.</param>
/// <exception cref="InvalidOperationException">An operation is already running.</exception>
/// <exception cref="ApplicationException">Initialization failed.</exception>
/// <remarks>
/// The outcome is delivered through the annotation-completed event: with a null
/// error on success, or with the exception on failure. If a completion handler
/// throws while the success is being reported, that exception is caught and
/// reported through a second completion event. The running flag is always
/// cleared when the background work ends, even if a handler throws.
/// </remarks>
public void ProcessCorpusAsync(Corpus corpus)
{
    if (_isRunning)
        throw new InvalidOperationException(MessageInvalidOperation);
    if (!Initialize())
        throw new ApplicationException(MessageInitializeFailed);

    _isRunning = true;
    _cts = new CancellationTokenSource();

    // Capture the token now so the worker never re-reads the _cts field.
    CancellationToken token = _cts.Token;

    Task.Factory.StartNew(() =>
    {
        try
        {
            try
            {
                AnnotateCorpus(corpus, token);
                OnAnnotationCompleted(
                    new AsyncCompletedEventArgs(null, false, null));
            }
            catch (Exception ex)
            {
                OnAnnotationCompleted(
                    new AsyncCompletedEventArgs(ex, false, null));
            }
        }
        finally
        {
            // Without this, a handler that throws while an error is being
            // reported would leave the annotator permanently "running".
            _isRunning = false;
        }
    }, token);
}
/// <summary>
/// Processes (annotates) a corpus synchronously.
/// </summary>
/// <param name="corpus">The corpus to process.</param>
/// <exception cref="InvalidOperationException">An operation is already running.</exception>
/// <exception cref="ApplicationException">Initialization failed.</exception>
public void ProcessCorpus(Corpus corpus)
{
    if (_isRunning)
    {
        throw new InvalidOperationException(MessageInvalidOperation);
    }

    bool ready = Initialize();
    if (!ready)
    {
        throw new ApplicationException(MessageInitializeFailed);
    }

    // The synchronous path cannot be cancelled, hence the empty token.
    AnnotateCorpus(corpus, CancellationToken.None);
}