예제 #1
0
        /// <summary>
        /// Creates a corpus (constructed with <c>Encoding.UTF8</c>) and adds to it every
        /// document that <paramref name="reader"/> produces from each of the given files.
        /// </summary>
        /// <param name="fileNames">Paths of the files to open for reading.</param>
        /// <param name="reader">Reader that turns an open file stream into documents.</param>
        /// <returns>
        /// The corpus holding the documents of every file that could be processed.
        /// A file whose opening, reading or adding throws is skipped; the failure is
        /// reported as a trace warning instead of being dropped silently.
        /// </returns>
        public Corpus GetCorpusFromText(IEnumerable<string> fileNames,
            IDocumentTextReader reader)
        {
            Corpus cor = new Corpus(Encoding.UTF8);

            foreach (var file in fileNames)
            {
                try
                {
                    using (Stream stream = new FileStream(file, FileMode.Open,
                                FileAccess.Read))
                    {
                        WEMDocument[] docs = reader.ReadDocuments(stream);

                        foreach (var doc in docs)
                        {
                            cor.AddDocument(doc);
                        }
                    }
                }
                catch (System.Exception ex)
                {
                    // Loading is deliberately best-effort: one bad file must not abort
                    // the whole corpus. Leave a trace so the skip is diagnosable.
                    System.Diagnostics.Trace.TraceWarning(
                        "GetCorpusFromText: skipping '{0}': {1}", file, ex.Message);
                }
            }

            return cor;
        }
예제 #2
0
        /// <summary>
        /// Creates a corpus with the given encoding and fills it with one document per
        /// listed file.
        /// </summary>
        /// <param name="fileNames">Paths of the files to load.</param>
        /// <param name="outputEncoding">Encoding passed to the new corpus.</param>
        /// <returns>The corpus after every file has been loaded and added.</returns>
        public Corpus GetCorpusFromXml(IEnumerable<string> fileNames,
            Encoding outputEncoding)
        {
            var corpus = new Corpus(outputEncoding);

            foreach (string path in fileNames)
            {
                // Each document is created with the corpus' own encoding; the value
                // returned by Load is what gets added to the corpus.
                var document = new WEMDocument(corpus.Encoding);
                corpus.AddDocument(document.Load(path));
            }

            return corpus;
        }
예제 #3
0
파일: Program.cs 프로젝트: Ran-QUAN/Alta
        /// <summary>
        /// Runs the console annotation pipeline: loads the annotation set, builds a corpus
        /// from the <c>.txt</c> files found in the corpus directory, processes it with the
        /// <c>ICTCLASAnnotator</c> and then the <c>CRFPPAnnotator</c>, and finally saves
        /// every document (plus a report per document when reports are enabled).
        /// </summary>
        /// <remarks>
        /// Returns immediately when <c>startCheck()</c> is false. Both annotators are
        /// started through their <c>ProcessCorpusAsync</c> methods; this method waits by
        /// polling a flag that the completion handlers set. Exceptions thrown directly in
        /// this method are caught and printed to the console rather than propagated.
        /// </remarks>
        public void Start()
        {
            if (!startCheck())
            {
                return;
            }

            Console.BackgroundColor = ConsoleColor.DarkBlue;
            Console.Clear();
            writeTitle();

            // Remember the current foreground colour so it can be restored before returning.
            ConsoleColor color = Console.ForegroundColor;
            Console.ForegroundColor = ConsoleColor.Green;

            try
            {
                Console.WriteLine(MessageInitializing);

                AnnotationSet wemas = new AnnotationSet();
                wemas.Load(_annotationSet);

                ICTCLASAnnotator ictclasAno = new ICTCLASAnnotator(wemas, null);
                CRFPPAnnotator crfppAno = new CRFPPAnnotator(wemas,
                    new Model(_crfppModel));
                crfppAno.SetCRFPPRootPath(_crfppDir);

                Corpus corpus = new Corpus(Encoding.GetEncoding(_xmlEncoding));

                Console.Write(MessageLoadingCorpus);
                savePosition();

                // Materialise the file list exactly once. It is indexed again when the
                // documents are saved, so it must not be re-read from disk (a deferred
                // query would list the directory anew on every Count()/ElementAt()).
                string[] documents = Directory.GetFiles(_corpusDir)
                    .Where(file => file.EndsWith(".txt",
                        StringComparison.OrdinalIgnoreCase))
                    .ToArray();

                if (documents.Length == 0)
                {
                    Console.WriteLine();
                    Console.WriteLine(MessageEmptyCorpus);
                    Console.WriteLine(MessageDone);
                    Console.ForegroundColor = color;
                    pause();
                    return;
                }

                _sentenceFactory = new SentenceFactory(wemas);
                _sentenceFactory.InputLanguage = Language.SimplifiedChinese;
                _sentenceFactory.OutputEncoding = corpus.Encoding;

                int count = 0;
                foreach (var doc in documents)
                {
                    Sentence[] sens;

                    // Dispose the stream as soon as the sentences have been extracted,
                    // so the file handle is not held for the rest of the run.
                    using (var input = new FileStream(doc, FileMode.Open))
                    {
                        sens = _sentenceFactory.GetSentences(input);
                    }

                    WEMDocument wDoc = new WEMDocument(corpus.Encoding);
                    wDoc.Name = doc;

                    foreach (var sen in sens)
                    {
                        wDoc.AddSentence(sen);
                    }

                    corpus.AddDocument(wDoc);

                    rewrite(String.Format("{0}%", (int)((double)
                        ++count / documents.Length * 100)));
                }

                Console.WriteLine("共加载{0}篇文档。",
                    corpus.Documents.Count);
                Console.Write(MessageAnnotatingSeg);
                savePosition();

                // Set by the completion handlers below (on success and on failure) and
                // polled at the end of this method.
                // NOTE(review): if a handler throws before setting this flag, the polling
                // loop may never end - confirm how the annotators surface handler errors.
                bool done = false;

                ictclasAno.AnnotationProgressChanged += (s, ea) =>
                    {
                        rewrite(String.Format("{0}%",
                            Math.Round(ea.ProgressInDouble, 2) * 100));
                    };

                ictclasAno.AnnotationCompleted += (s, ea) =>
                    {
                        if (ea.Error == null)
                        {
                            rewrite("100%");
                            Console.WriteLine(MessageAnnotatingEnt);

                            crfppAno.AnnotationProgressChanged += (sen, args) =>
                                {
                                    Console.WriteLine(args.Message);
                                };

                            crfppAno.AnnotationCompleted += (sen, args) =>
                                {
                                    if (args.Error == null)
                                    {
                                        Console.WriteLine(MessageAnnotationFinished);
                                        if (_reportsEnabled)
                                        {
                                            Console.Write(MessageSavingDocsAndReps);
                                        }
                                        else
                                        {
                                            Console.Write(MessageSavingDocuments);
                                        }
                                        savePosition();

                                        int sCount = 0;
                                        foreach (var doc in corpus.Documents)
                                        {
                                            // One document was added per file, in file
                                            // order, so the document's index in the
                                            // corpus selects its source file.
                                            string sourcePath = documents[
                                                corpus.Documents.IndexOf(doc)];

                                            // The report keeps the source file name;
                                            // the saved document swaps the extension.
                                            string reportFileName
                                                = Path.GetFileName(sourcePath);
                                            string fileName = _outputDir
                                                + Path.ChangeExtension(
                                                    reportFileName, "xml");

                                            doc.Save(fileName);

                                            if (_reportsEnabled)
                                            {
                                                saveReport(doc,
                                                    _reportsDir + reportFileName,
                                                    wemas);
                                            }

                                            rewrite(String.Format("{0}%",
                                                (int)((double)++sCount
                                                / corpus.Documents.Count * 100)));
                                        }

                                        Console.WriteLine(MessageDone);
                                        done = true;
                                    }
                                    else
                                    {
                                        Console.ForegroundColor = ConsoleColor.Red;
                                        Console.WriteLine();
                                        Console.WriteLine(MessageUnhandledException);
                                        Console.WriteLine(args.Error.Message);
                                        Console.WriteLine(args.Error.StackTrace);
                                        done = true;
                                    }
                                };

                            // The second annotator is only started once the first one
                            // has completed without error.
                            if (crfppAno.Initialize())
                            {
                                crfppAno.ProcessCorpusAsync(corpus);
                            }
                            else
                            {
                                Console.ForegroundColor = ConsoleColor.Red;
                                Console.WriteLine(MessageEntInitFailed);
                                done = true;
                            }
                        }
                        else
                        {
                            Console.ForegroundColor = ConsoleColor.Red;
                            Console.WriteLine();
                            Console.WriteLine(MessageUnhandledException);
                            Console.WriteLine(ea.Error.Message);
                            Console.WriteLine(ea.Error.StackTrace);
                            done = true;
                        }
                    };

                if (ictclasAno.Initialize())
                {
                    ictclasAno.ProcessCorpusAsync(corpus);

                    // Block until one of the completion handlers reports the end of
                    // the pipeline.
                    while (!done)
                    {
                        Thread.Sleep(50);
                    }
                }
                else
                {
                    Console.ForegroundColor = ConsoleColor.Red;
                    Console.WriteLine(MessageSegInitFailed);
                }
            }
            catch (Exception ex)
            {
                Console.ForegroundColor = ConsoleColor.Red;
                Console.WriteLine();
                Console.WriteLine(MessageUnhandledException);
                Console.WriteLine(ex.Message);
            }

            Console.ForegroundColor = color;
        }