Exemple #1
0
        /// <summary>
        /// Writes a plain-text report of <paramref name="document"/> to
        /// <paramref name="fileName"/>. Each sentence becomes one output line;
        /// entity words are written as "[content \ annotation] " and every
        /// other word is written as its content followed by a single space.
        /// </summary>
        /// <param name="document">Document whose sentences and words are reported.</param>
        /// <param name="fileName">Path of the report file; an existing file is overwritten.</param>
        /// <param name="annotationSet">Set indexed by each entity's EntityId to obtain the text printed next to the entity.</param>
        static void saveReport(WEMDocument document, string fileName,
            AnnotationSet annotationSet)
        {
            // FileMode.Create truncates an existing file. The previous
            // OpenOrCreate mode kept the old bytes, so a shorter report written
            // over a longer one ended with stale trailing content.
            using (StreamWriter writer = new StreamWriter(
                new FileStream(fileName, FileMode.Create)))
            {
                foreach (var sen in document.Sentences)
                {
                    foreach (var word in sen.Words)
                    {
                        if (word is Entity)
                        {
                            Entity entity = (Entity)word;
                            writer.Write(String.Format("[{0} \\ {1}] ",
                                entity.Content,
                                annotationSet[entity.EntityId]));
                        }
                        else
                        {
                            writer.Write(word.Content + ' ');
                        }
                    }

                    // One report line per sentence.
                    writer.WriteLine();
                }

                // No explicit Close(): leaving the using block flushes and
                // closes the writer together with the underlying stream.
            }
        }
Exemple #2
0
        /// <summary>
        /// Adds a document to the document set. If the document's encoding
        /// differs from this set's encoding, the document is converted to the
        /// set's encoding before it is stored.
        /// </summary>
        /// <param name="doc">The document to add.</param>
        /// <exception cref="ArgumentNullException">
        /// Thrown when <paramref name="doc"/> is null.
        /// </exception>
        public void AddDocument(WEMDocument doc)
        {
            if (doc == null)
            {
                throw new ArgumentNullException("doc");
            }

            // Encoding and ChangeEncoding are reached through the
            // IEncodingConvertible view of the document.
            IEncodingConvertible convertible = (IEncodingConvertible)doc;
            if (convertible.Encoding != this.Encoding)
            {
                convertible.ChangeEncoding(this.Encoding);
            }

            _documents.Add(doc);
        }
Exemple #3
0
        /// <summary>
        /// Reads <paramref name="textStream"/> through the sentence factory and
        /// wraps all resulting sentences in a single document.
        /// Sets <c>OutputEncoding</c> to UTF-8 as a side effect.
        /// </summary>
        /// <param name="textStream">Stream handed to the sentence factory.</param>
        /// <returns>A one-element array containing the document that was built.</returns>
        public WEMDocument[] ReadDocuments(Stream textStream)
        {
            OutputEncoding = Encoding.UTF8;

            Sentence[] sentences = _sententceFactory.GetSentences(textStream);
            WEMDocument result = new WEMDocument(OutputEncoding);

            for (int i = 0; i < sentences.Length; i++)
            {
                result.AddSentence(sentences[i]);
            }

            // Tag the document with the id of the annotation set.
            result.ApplicationId = AnnotationSet.Id;

            return new[] { result };
        }
Exemple #4
0
        /// <summary>
        /// Runs the console pipeline: loads the annotation set, reads every
        /// ".txt" file of the corpus directory into a corpus, runs the ICTCLAS
        /// annotator and then the CRF++ annotator over it, and finally saves each
        /// document as XML (plus an optional plain-text report).
        /// Does nothing when <c>startCheck()</c> returns false. Exceptions thrown
        /// on the calling thread are reported on the console, not propagated.
        /// </summary>
        public void Start()
        {
            if (!startCheck())
            {
                return;
            }

            Console.BackgroundColor = ConsoleColor.DarkBlue;
            Console.Clear();
            writeTitle();

            // Remember the caller's colour so it can be restored on exit.
            ConsoleColor color = Console.ForegroundColor;
            Console.ForegroundColor = ConsoleColor.Green;

            try
            {
                Console.WriteLine(MessageInitializing);

                AnnotationSet wemas = new AnnotationSet();
                wemas.Load(_annotationSet);

                ICTCLASAnnotator ictclasAno = new ICTCLASAnnotator(wemas, null);
                CRFPPAnnotator crfppAno = new CRFPPAnnotator(wemas,
                    new Model(_crfppModel));
                crfppAno.SetCRFPPRootPath(_crfppDir);

                Corpus corpus = new Corpus(Encoding.GetEncoding(_xmlEncoding));

                Console.Write(MessageLoadingCorpus);
                savePosition();

                // Materialize the file list exactly once. As a deferred query,
                // every Count()/ElementAt() call re-listed the directory, which
                // was quadratic and could pair a document with the wrong file
                // name if the directory changed while the pipeline was running.
                string[] documents = (from file in Directory.GetFiles(_corpusDir)
                                      where file.EndsWith(".txt",
                                          StringComparison.OrdinalIgnoreCase)
                                      select file).ToArray();

                if (documents.Length == 0)
                {
                    Console.WriteLine();
                    Console.WriteLine(MessageEmptyCorpus);
                    Console.WriteLine(MessageDone);
                    Console.ForegroundColor = color;
                    pause();
                    return;
                }

                _sentenceFactory
                    = new SentenceFactory(wemas);
                _sentenceFactory.InputLanguage
                    = Language.SimplifiedChinese;
                _sentenceFactory.OutputEncoding = corpus.Encoding;

                int count = 0;
                foreach (var doc in documents)
                {
                    Sentence[] sens;

                    // Dispose the input stream as soon as the sentences have
                    // been extracted; it used to be leaked for every file.
                    using (FileStream input = new FileStream(doc, FileMode.Open))
                    {
                        sens = _sentenceFactory.GetSentences(input);
                    }

                    WEMDocument wDoc = new WEMDocument(corpus.Encoding);
                    wDoc.Name = doc;

                    foreach (var sen in sens)
                    {
                        wDoc.AddSentence(sen);
                    }

                    corpus.AddDocument(wDoc);

                    rewrite(String.Format("{0}%", (int)((double)
                        ++count / documents.Length * 100)));
                }

                Console.WriteLine("共加载{0}篇文档。",
                    corpus.Documents.Count);
                Console.Write(MessageAnnotatingSeg);
                savePosition();

                // Set by the completion handlers below and polled by the wait
                // loop at the end of this method.
                // NOTE(review): the handlers presumably run on a worker thread;
                // confirm that this plain captured flag is visible across threads.
                bool done = false;

                ictclasAno.AnnotationProgressChanged += (s, ea) =>
                    {
                        rewrite(String.Format("{0}%",
                            Math.Round(ea.ProgressInDouble, 2) * 100));
                    };

                ictclasAno.AnnotationCompleted += (s, ea) =>
                    {
                        if (ea.Error == null)
                        {
                            rewrite("100%");
                            Console.WriteLine(MessageAnnotatingEnt);

                            crfppAno.AnnotationProgressChanged += (sen, args) =>
                                {
                                    Console.WriteLine(args.Message);
                                };

                            crfppAno.AnnotationCompleted += (sen, args) =>
                                {
                                    if (args.Error == null)
                                    {
                                        Console.WriteLine(MessageAnnotationFinished);
                                        if (_reportsEnabled)
                                            Console.Write(MessageSavingDocsAndReps);
                                        else
                                            Console.Write(MessageSavingDocuments);
                                        savePosition();

                                        int sCount = 0;
                                        foreach (var doc in corpus.Documents)
                                        {
                                            // Documents were added in the same order
                                            // as the file list, so the document's
                                            // index selects its source file.
                                            string sourcePath = documents[
                                                corpus.Documents.IndexOf(doc)];

                                            // Reports keep the original file name;
                                            // the XML output swaps the extension.
                                            string reportFileName
                                                = Path.GetFileName(sourcePath);
                                            string fileName = _outputDir
                                                + Path.ChangeExtension(
                                                    reportFileName, "xml");

                                            doc.Save(fileName);

                                            if (_reportsEnabled)
                                                saveReport(doc,
                                                    _reportsDir + reportFileName,
                                                    wemas);

                                            rewrite(String.Format("{0}%",
                                                (int)((double)++sCount
                                                / corpus.Documents.Count * 100)));
                                        }

                                        Console.WriteLine(MessageDone);
                                        done = true;
                                    }
                                    else
                                    {
                                        Console.ForegroundColor = ConsoleColor.Red;
                                        Console.WriteLine();
                                        Console.WriteLine(MessageUnhandledException);
                                        Console.WriteLine(args.Error.Message);
                                        Console.WriteLine(args.Error.StackTrace);
                                        done = true;
                                    }
                                };

                            if (crfppAno.Initialize())
                            {
                                crfppAno.ProcessCorpusAsync(corpus);
                            }
                            else
                            {
                                Console.ForegroundColor = ConsoleColor.Red;
                                Console.WriteLine(MessageEntInitFailed);
                                done = true;
                            }
                        }
                        else
                        {
                            Console.ForegroundColor = ConsoleColor.Red;
                            Console.WriteLine();
                            Console.WriteLine(MessageUnhandledException);
                            Console.WriteLine(ea.Error.Message);
                            Console.WriteLine(ea.Error.StackTrace);
                            done = true;
                        }
                    };

                if (ictclasAno.Initialize())
                {
                    ictclasAno.ProcessCorpusAsync(corpus);

                    // Block until one of the completion handlers reports that
                    // the pipeline finished or failed.
                    while (!done)
                        Thread.Sleep(50);
                }
                else
                {
                    Console.ForegroundColor = ConsoleColor.Red;
                    Console.WriteLine(MessageSegInitFailed);
                }
            }
            catch (Exception ex)
            {
                Console.ForegroundColor = ConsoleColor.Red;
                Console.WriteLine();
                Console.WriteLine(MessageUnhandledException);
                Console.WriteLine(ex.Message);
            }

            Console.ForegroundColor = color;
        }
Exemple #5
0
 /// <summary>
 /// Removes a document from the document set.
 /// </summary>
 /// <param name="doc">The document to remove.</param>
 public void RemoveDocument(WEMDocument doc)
 {
     // The result of Remove is intentionally not inspected.
     this._documents.Remove(doc);
 }
Exemple #6
0
        /// <summary>
        /// Parses one console command line and executes it. The first
        /// whitespace-separated token selects the command (case-insensitive):
        /// "exit", "clear", "genas", "testas" or "testws". An empty line is
        /// ignored and anything else prints an "Unknown Command" message.
        /// </summary>
        /// <param name="command">The raw command line typed by the user.</param>
        static void ProcessCommand(string command)
        {
            string[] blocks = command.Split();

            // Invariant lower-casing: culture-sensitive ToLower() maps 'I' to a
            // dotless 'ı' under Turkish cultures, so "EXIT" would not match.
            switch (blocks[0].ToLowerInvariant())
            {
                case "":
                    return;

                case "exit":
                    if (blocks.Length == 1)
                        _exitFlag = true;
                    else
                        Console.WriteLine("Unknown Parameters.");
                    return;

                case "clear":
                    if (blocks.Length == 1)
                        Console.Clear();
                    else
                        Console.WriteLine("Unknown Parameters.");
                    return;

                case "genas":
                    // Convert the XML annotation dictionary into an annotation
                    // set file.
                    AnnotationDictionary dict = new AnnotationDictionary();
                    AnnotationSet ans = new AnnotationSet("WEMAS");
                    dict.Load("WEMAS.xml");

                    foreach (var entry in dict._Dictionary)
                    {
                        ans.SetAnnotationDescription(entry.Key, entry.Value);
                    }

                    foreach (var sep in dict.SentenceSeparators)
                    {
                        ans.AddSentenceSeparator(sep);
                    }

                    ans.Save("WEMAS.wemas");
                    return;

                case "testas":
                    // Load the annotation set, change its description and save
                    // it under a second file name.
                    AnnotationSet ans2 = new AnnotationSet("WEMAS");
                    ans2.Load("WEMAS.wemas");
                    ans2.Description =
                        "WEB ENTITY MINER Default Annotation Set.";

                    ans2.Save("WEMAS2.wemas");
                    return;

                case "testws":
                    // Run both annotators over "utf8.txt" and print each word.
                    AnnotationSet wemas = new AnnotationSet();
                    wemas.Load("WEMAS.wemas");

                    ICTCLASAnnotator ano = new ICTCLASAnnotator(wemas, null);
                    CRFPPAnnotator crfAno = new CRFPPAnnotator(wemas,
                        new Model(
                            AppDomain.CurrentDomain.BaseDirectory
                            + "model.crfppmodel"));

                    Corpus c = new Corpus(Encoding.UTF8);
                    SentenceFactory.AnnotationSet = wemas;
                    SentenceFactory.InputLanguage = Language.SimplifiedChinese;
                    SentenceFactory.OutputEncoding = Encoding.UTF8;

                    WEMDocument doc = new WEMDocument(Encoding.UTF8);

                    // Dispose the input stream once its sentences have been
                    // copied into the document; it used to be leaked.
                    using (FileStream input = new FileStream(
                        "utf8.txt", FileMode.Open, FileAccess.Read))
                    {
                        foreach (var sen in SentenceFactory.GetSentences(input))
                        {
                            doc.AddSentence(sen);
                        }
                    }

                    c.AddDocument(doc);

                    try
                    {
                        ano.ProcessCorpus(c);
                        crfAno.ProcessCorpus(c);
                    }
                    catch (Exception ex)
                    {
                        Console.WriteLine("Unhandled Exception:\n{0}",
                            ex.Message);
                    }

                    // The document is saved even when annotation failed above.
                    doc.Save("RESULT.xml");

                    foreach (var sententce in doc.Sentences)
                    {
                        foreach (var word in sententce.Words)
                        {
                            if (word is Entity)
                            {
                                Console.WriteLine("{0}/ENTITY:{1}",
                                    word.Content,
                                    wemas[((Entity)word).EntityId]);
                            }
                            else
                            {
                                Console.WriteLine("{0}", word.Content);
                            }
                        }
                    }
                    return;

                default:
                    Console.WriteLine("Unknown Command: '{0}'.", blocks[0]);
                    return;
            }
        }
Exemple #7
0
 /// <summary>
 /// Creates a new document named <c>DocumentName</c> using the configured
 /// encoding and registers it under that name. Does nothing when a document
 /// with that name is already registered.
 /// </summary>
 public void CreateDocument()
 {
     if (_documents.ContainsKey(DocumentName))
     {
         return;
     }

     WEMDocument created = new WEMDocument(_encoding) { Name = DocumentName };
     _documents.Add(DocumentName, created);
 }