示例#1
0
        /// <summary>
        /// Re-segments every sentence of every document in the corpus with
        /// the ICTCLAS segmenter and reports progress after each document.
        /// </summary>
        /// <param name="corpus">The corpus whose sentences are annotated in place.</param>
        /// <param name="token">
        /// Checked once per document; when cancellation has been requested the
        /// loop stops quietly (no exception is thrown).
        /// </param>
        protected override void AnnotateCorpus(Corpus corpus,
            System.Threading.CancellationToken token)
        {
            Encoding encoding = corpus.Encoding;
            int processed = 0;

            foreach (var document in corpus.Documents)
            {
                if (token.IsCancellationRequested)
                {
                    break;
                }

                foreach (var sentence in document.Sentences)
                {
                    // Drop any previous word annotations before re-segmenting.
                    sentence.ClearWords();

                    ResultTerm[] terms =
                        _ictclas.Segment(sentence.Content, encoding);

                    for (int i = 0; i < terms.Length; i++)
                    {
                        ResultTerm term = terms[i];
                        Word word = new Word(
                            term.Start, term.Length, term.ClassID);
                        sentence.AnnotateWord(word);
                    }
                }

                processed++;
                AnnotationProgressChangedEventArgs progress =
                    new AnnotationProgressChangedEventArgs(
                        processed, corpus.Documents.Count);
                OnAnnotationProgressChanged(progress);
            }
        }
示例#2
0
        /// <summary>
        /// Builds a corpus from plain-text files by running each file through
        /// the supplied document reader.
        /// </summary>
        /// <param name="fileNames">Paths of the files to read.</param>
        /// <param name="reader">Reader that turns a stream into documents.</param>
        /// <param name="outputEncoding">
        /// Encoding passed to the reader and used for the resulting corpus.
        /// When null the corpus falls back to UTF-8.
        /// </param>
        /// <returns>
        /// A corpus holding the documents of every file that could be read.
        /// Files that fail to open or parse are skipped.
        /// </returns>
        public Corpus GetCorpusFromText(IEnumerable<string> fileNames,
            IDocumentTextReader reader, Encoding outputEncoding)
        {
            // The corpus must carry the same encoding as the documents it
            // holds; previously it was always UTF-8 regardless of
            // outputEncoding.
            Corpus cor = new Corpus(outputEncoding ?? Encoding.UTF8);

            foreach (var file in fileNames)
            {
                try
                {
                    using (Stream stream = new FileStream(file, FileMode.Open,
                                FileAccess.Read))
                    {
                        WEMDocument[] docs =
                            reader.ReadDocuments(stream, outputEncoding);

                        foreach (var doc in docs)
                        {
                            cor.AddDocument(doc);
                        }
                    }
                }
                catch
                {
                    // Deliberate best effort: an unreadable or malformed file
                    // must not abort loading of the remaining files.
                }
            }

            return cor;
        }
示例#3
0
        /// <summary>
        /// Annotates the corpus with CRF++ in three reported stages: encode
        /// the corpus to a CRF++ data file, run the annotator, decode the
        /// output back into the corpus. Work files live in a per-task
        /// directory under "Tasks" in the application base directory.
        /// </summary>
        /// <param name="corpus">The corpus to annotate in place.</param>
        /// <param name="token">
        /// Checked before starting and after each stage; a cancellation
        /// request surfaces as the exception thrown by
        /// <c>ThrowIfCancellationRequested</c>.
        /// </param>
        protected override void AnnotateCorpus(
            Corpus corpus, System.Threading.CancellationToken token)
        {
            token.ThrowIfCancellationRequested();

            Guid taskId = Guid.NewGuid();

            DirectoryInfo dir = new DirectoryInfo(
                AppDomain.CurrentDomain.BaseDirectory);

            dir = dir.CreateSubdirectory("Tasks");
            dir = dir.CreateSubdirectory(taskId.ToString());

            string dataFile = Path.Combine(dir.FullName, "Corpus.crfppdata");
            string outputFile = Path.Combine(dir.FullName, "Output.crfppdata");

            try
            {
                AnnotationProgressChangedEventArgs ePrepared =
                    new AnnotationProgressChangedEventArgs(
                        1, 3, MessagePreparing);
                OnAnnotationProgressChanged(ePrepared);

                CRFPPHelper.EncodeCorpusToCRFPPData(corpus, dataFile);
                token.ThrowIfCancellationRequested();

                AnnotationProgressChangedEventArgs eAnnotating =
                    new AnnotationProgressChangedEventArgs(
                        2, 3, MessageAnnotating);
                OnAnnotationProgressChanged(eAnnotating);

                CRFPPHelper.Annotate(
                    Model.RootPath, dataFile, outputFile, 1, 0);
                token.ThrowIfCancellationRequested();

                AnnotationProgressChangedEventArgs eFinishing =
                    new AnnotationProgressChangedEventArgs(
                        3, 3, MessageFinishing);
                OnAnnotationProgressChanged(eFinishing);

                CRFPPHelper.DecodeCorpusFromCRFPPData(corpus, outputFile);
                token.ThrowIfCancellationRequested();
            }
            finally
            {
                // Cleanup is best effort. An exception escaping from here
                // would replace the real error (or the cancellation) raised
                // above and would skip the remaining cleanup steps.
                foreach (string path in new[] { dataFile, outputFile })
                {
                    try
                    {
                        File.Delete(path);
                    }
                    catch (IOException)
                    {
                        // File still in use or directory gone; leave it.
                    }
                    catch (UnauthorizedAccessException)
                    {
                        // No permission to delete; leave it.
                    }
                }

                try
                {
                    dir.Delete();
                }
                catch
                {
                    // Directory not empty or locked; leave it behind.
                }
            }
        }
示例#4
0
        /// <summary>
        /// Builds a corpus by loading one document from each of the given
        /// XML files.
        /// </summary>
        /// <param name="fileNames">Paths of the files to load.</param>
        /// <param name="outputEncoding">Encoding assigned to the corpus and to each document.</param>
        /// <returns>The corpus containing every loaded document.</returns>
        public Corpus GetCorpusFromXml(IEnumerable<string> fileNames,
            Encoding outputEncoding)
        {
            Corpus result = new Corpus(outputEncoding);

            foreach (string path in fileNames)
            {
                var document = new WEMDocument(result.Encoding);
                result.AddDocument(document.Load(path));
            }

            return result;
        }
示例#5
0
文件: Program.cs 项目: Ran-QUAN/Alta
        /// <summary>
        /// Runs the console pipeline: loads the annotation set, builds a
        /// corpus from the ".txt" files in the corpus directory, runs the
        /// ICTCLAS annotator and then the CRF++ annotator asynchronously, and
        /// finally saves each document as XML (plus a report when reports are
        /// enabled). Progress and errors are written to the console.
        /// Does nothing when <c>startCheck()</c> returns false.
        /// </summary>
        public void Start()
        {
            if (!startCheck())
            {
                return;
            }

            Console.BackgroundColor = ConsoleColor.DarkBlue;
            Console.Clear();
            writeTitle();

            ConsoleColor color = Console.ForegroundColor;
            Console.ForegroundColor = ConsoleColor.Green;

            try
            {
                Console.WriteLine(MessageInitializing);

                AnnotationSet wemas = new AnnotationSet();
                wemas.Load(_annotationSet);

                ICTCLASAnnotator ictclasAno =
                    new ICTCLASAnnotator(wemas, null);
                CRFPPAnnotator crfppAno = new CRFPPAnnotator(wemas,
                    new Model(_crfppModel));
                crfppAno.SetCRFPPRootPath(_crfppDir);

                Corpus corpus =
                    new Corpus(Encoding.GetEncoding(_xmlEncoding));

                Console.Write(MessageLoadingCorpus);
                savePosition();

                // Materialise the file list once. A deferred query would hit
                // the file system again on every Count()/ElementAt() call and
                // could change between the load and save phases.
                string[] documents = Directory.GetFiles(_corpusDir)
                    .Where(file => file.EndsWith(".txt",
                        StringComparison.OrdinalIgnoreCase))
                    .ToArray();

                if (documents.Length == 0)
                {
                    Console.WriteLine();
                    Console.WriteLine(MessageEmptyCorpus);
                    Console.WriteLine(MessageDone);
                    Console.ForegroundColor = color;
                    pause();
                    return;
                }

                _sentenceFactory
                    = new SentenceFactory(wemas);
                _sentenceFactory.InputLanguage
                    = Language.SimplifiedChinese;
                _sentenceFactory.OutputEncoding = corpus.Encoding;

                int count = 0;
                foreach (var doc in documents)
                {
                    Sentence[] sens;

                    // Dispose the stream as soon as the sentences are read so
                    // the file handle is not leaked.
                    using (FileStream input = new FileStream(
                        doc, FileMode.Open, FileAccess.Read))
                    {
                        sens = _sentenceFactory.GetSentences(input);
                    }

                    WEMDocument wDoc = new WEMDocument(corpus.Encoding);
                    wDoc.Name = doc;

                    foreach (var sen in sens)
                    {
                        wDoc.AddSentence(sen);
                    }

                    corpus.AddDocument(wDoc);

                    rewrite(String.Format("{0}%", (int)((double)
                        ++count / documents.Length * 100)));
                }

                Console.WriteLine("共加载{0}篇文档。",
                    corpus.Documents.Count);
                Console.Write(MessageAnnotatingSeg);
                savePosition();

                // NOTE(review): `done` is written by annotator callbacks and
                // polled below without synchronisation; a
                // ManualResetEventSlim would be more robust.
                bool done = false;

                ictclasAno.AnnotationProgressChanged += (s, ea) =>
                    {
                        rewrite(String.Format("{0}%",
                            Math.Round(ea.ProgressInDouble, 2) * 100));
                    };

                ictclasAno.AnnotationCompleted += (s, ea) =>
                    {
                        if (ea.Error == null)
                        {
                            rewrite("100%");
                            Console.WriteLine(MessageAnnotatingEnt);

                            crfppAno.AnnotationProgressChanged += (src, args) =>
                                {
                                    Console.WriteLine(args.Message);
                                };

                            crfppAno.AnnotationCompleted += (src, args) =>
                                {
                                    if (args.Error == null)
                                    {
                                        Console.WriteLine(
                                            MessageAnnotationFinished);
                                        if (_reportsEnabled)
                                            Console.Write(MessageSavingDocsAndReps);
                                        else
                                            Console.Write(MessageSavingDocuments);
                                        savePosition();

                                        int sCount = 0;
                                        foreach (var doc in corpus.Documents)
                                        {
                                            // Documents were added in the same
                                            // order as the source files, so the
                                            // document index selects its file.
                                            string sourcePath = documents[
                                                corpus.Documents.IndexOf(doc)];

                                            string reportFileName =
                                                new FileInfo(sourcePath).Name;

                                            // Replace the three-letter "txt"
                                            // extension with "xml".
                                            string fileName = _outputDir
                                                + reportFileName.Remove(
                                                    reportFileName.Length - 3)
                                                + "xml";

                                            doc.Save(fileName);

                                            if (_reportsEnabled)
                                                saveReport(doc,
                                                    _reportsDir + reportFileName,
                                                    wemas);

                                            rewrite(String.Format("{0}%",
                                                (int)((double)++sCount
                                                / corpus.Documents.Count * 100)));
                                        }

                                        Console.WriteLine(MessageDone);
                                        done = true;
                                    }
                                    else
                                    {
                                        Console.ForegroundColor = ConsoleColor.Red;
                                        Console.WriteLine();
                                        Console.WriteLine(MessageUnhandledException);
                                        Console.WriteLine(args.Error.Message);
                                        Console.WriteLine(args.Error.StackTrace);
                                        done = true;
                                    }
                                };

                            if (crfppAno.Initialize())
                            {
                                crfppAno.ProcessCorpusAsync(corpus);
                            }
                            else
                            {
                                Console.ForegroundColor = ConsoleColor.Red;
                                Console.WriteLine(MessageEntInitFailed);
                                done = true;
                                return;
                            }
                        }
                        else
                        {
                            Console.ForegroundColor = ConsoleColor.Red;
                            Console.WriteLine();
                            Console.WriteLine(MessageUnhandledException);
                            Console.WriteLine(ea.Error.Message);
                            Console.WriteLine(ea.Error.StackTrace);
                            done = true;
                        }
                    };

                if (ictclasAno.Initialize())
                {
                    ictclasAno.ProcessCorpusAsync(corpus);

                    // Block the console thread until one of the completion
                    // callbacks above signals the end of the pipeline.
                    while (!done)
                        Thread.Sleep(50);
                }
                else
                {
                    Console.ForegroundColor = ConsoleColor.Red;
                    Console.WriteLine(MessageSegInitFailed);
                }
            }
            catch (Exception ex)
            {
                Console.ForegroundColor = ConsoleColor.Red;
                Console.WriteLine();
                Console.WriteLine(MessageUnhandledException);
                Console.WriteLine(ex.Message);
            }

            Console.ForegroundColor = color;
        }
示例#6
0
        /// <summary>
        /// Trains synchronously on a document collection.
        /// </summary>
        /// <param name="corpus">The training set.</param>
        /// <returns>The statistical model produced by training.</returns>
        /// <exception cref="InvalidOperationException">
        /// Thrown when an operation is already running.
        /// </exception>
        /// <exception cref="ApplicationException">
        /// Thrown when initialization fails.
        /// </exception>
        public Model Train(Corpus corpus)
        {
            if (_isRunning)
            {
                throw new InvalidOperationException(MessageInvalidOperation);
            }

            bool ready = Initialize();
            if (!ready)
            {
                throw new ApplicationException(MessageInitializeFailed);
            }

            return LearnCorpus(corpus, CancellationToken.None);
        }
示例#7
0
 /// <summary>
 /// Abstract method that trains on a document collection.
 /// </summary>
 /// <param name="corpus">The training set.</param>
 /// <param name="token">Token that signals a request to cancel the training.</param>
 /// <returns>The statistical model obtained by training.</returns>
 /// <remarks>
 /// Implementations should complete synchronously.
 /// </remarks>
 protected abstract Model LearnCorpus(Corpus corpus,
     CancellationToken token);
示例#8
0
        /// <summary>
        /// Starts training on a document collection on a background task.
        /// The outcome (model or error) is reported through
        /// <c>OnTrainingCompleted</c>.
        /// </summary>
        /// <param name="corpus">The training set.</param>
        /// <exception cref="InvalidOperationException">
        /// Thrown when an operation is already running.
        /// </exception>
        /// <exception cref="ApplicationException">
        /// Thrown when initialization fails.
        /// </exception>
        public void TrainAsync(Corpus corpus)
        {
            if (_isRunning)
                throw new InvalidOperationException(MessageInvalidOperation);

            if (!Initialize())
                throw new ApplicationException(MessageInitializeFailed);

            _isRunning = true;
            _cts = new CancellationTokenSource();

            // Capture the token now so the task never depends on a later
            // reassignment of the _cts field.
            CancellationToken token = _cts.Token;

            // The token is deliberately NOT passed to StartNew: if it were
            // cancelled before the task got scheduled the body would never
            // run, no completion event would fire and _isRunning would stay
            // true forever. LearnCorpus observes the token itself.
            Task.Factory.StartNew(() =>
            {
                try
                {
                    Model model = LearnCorpus(corpus, token);

                    OnTrainingCompleted(
                        new TrainingCompletedEventArgs(
                            model, null, false, null));
                }
                catch (Exception ex)
                {
                    OnTrainingCompleted(
                        new TrainingCompletedEventArgs(
                            null, ex, false, null));
                }
                finally
                {
                    // Reset even when a completion handler throws, so the
                    // trainer cannot get stuck in the running state.
                    _isRunning = false;
                }
            });
        }
示例#9
0
 /// <summary>
 /// Abstract method that annotates a document collection.
 /// </summary>
 /// <param name="corpus">The document collection.</param>
 /// <param name="token">Token that signals a request to cancel the annotation.</param>
 /// <remarks>
 /// Implementations should complete synchronously.
 /// </remarks>
 protected abstract void AnnotateCorpus(Corpus corpus,
     CancellationToken token);
示例#10
0
        /// <summary>
        /// Starts annotating a document collection on a background task.
        /// The outcome is reported through <c>OnAnnotationCompleted</c>; on
        /// failure the event carries the exception.
        /// </summary>
        /// <param name="corpus">The document collection.</param>
        /// <exception cref="InvalidOperationException">
        /// Thrown when an operation is already running.
        /// </exception>
        /// <exception cref="ApplicationException">
        /// Thrown when initialization fails.
        /// </exception>
        public void ProcessCorpusAsync(Corpus corpus)
        {
            if (_isRunning)
                throw new InvalidOperationException(MessageInvalidOperation);

            if (!Initialize())
                throw new ApplicationException(MessageInitializeFailed);

            _isRunning = true;
            _cts = new CancellationTokenSource();

            // Capture the token now so the task never depends on a later
            // reassignment of the _cts field.
            CancellationToken token = _cts.Token;

            // The token is deliberately NOT passed to StartNew: if it were
            // cancelled before the task got scheduled the body would never
            // run, no completion event would fire and _isRunning would stay
            // true forever. AnnotateCorpus observes the token itself.
            Task.Factory.StartNew(() =>
            {
                try
                {
                    AnnotateCorpus(corpus, token);

                    OnAnnotationCompleted(
                        new AsyncCompletedEventArgs(null, false, null));
                }
                catch (Exception ex)
                {
                    // Also reached when a success handler throws: the event
                    // is raised again carrying that error, as before.
                    OnAnnotationCompleted(
                        new AsyncCompletedEventArgs(ex, false, null));
                }
                finally
                {
                    // Reset even when a completion handler throws, so the
                    // annotator cannot get stuck in the running state.
                    _isRunning = false;
                }
            });
        }
示例#11
0
        /// <summary>
        /// Annotates a document collection synchronously.
        /// </summary>
        /// <param name="corpus">The document collection.</param>
        /// <exception cref="InvalidOperationException">
        /// Thrown when an operation is already running.
        /// </exception>
        /// <exception cref="ApplicationException">
        /// Thrown when initialization fails.
        /// </exception>
        public void ProcessCorpus(Corpus corpus)
        {
            if (_isRunning)
            {
                throw new InvalidOperationException(MessageInvalidOperation);
            }

            bool ready = Initialize();
            if (!ready)
            {
                throw new ApplicationException(MessageInitializeFailed);
            }

            AnnotateCorpus(corpus, CancellationToken.None);
        }