示例#1
0
        /// <summary>
        /// Demo routine: builds the word net for a fixed sample sentence,
        /// generates the bigram graph from it and prints that graph to the console.
        /// </summary>
        public static void TestBiGraphGenerate()
        {
            // Both dictionaries are required; report the failure and stop if either cannot be loaded.
            WordDictionary coreDict = new WordDictionary();
            bool coreLoaded = coreDict.Load(coreDictFile);
            if (!coreLoaded)
            {
                Console.WriteLine("coreDict 字典装入错误!");
                return;
            }

            WordDictionary biDict = new WordDictionary();
            bool biLoaded = biDict.Load(biDictFile);
            if (!biLoaded)
            {
                Console.WriteLine("字典装入错误!");
                return;
            }

            // Surround the sample text with the sentence begin/end markers.
            string wrapped = Predefine.SENTENCE_BEGIN + @"他说的确实在理" + Predefine.SENTENCE_END;

            // Step 1: atom segmentation.
            List<AtomNode> atoms = Segment.AtomSegment(wrapped);

            // Step 2: look the atoms up in the core dictionary and collect every
            // candidate segmentation into the word net.
            RowFirstDynamicArray<ChainContent> wordNet = Segment.GenerateWordNet(atoms, coreDict);

            // Step 3: enumerate the possible pairwise combinations.
            ColumnFirstDynamicArray<ChainContent> biGraph = Segment.BiGraphGenerate(wordNet, 0.1, biDict, coreDict);

            Console.WriteLine(biGraph.ToString());
        }
示例#2
0
        /// <summary>
        /// Configures the unknown-word recognizer: loads its dictionary and its
        /// context file, then selects the tagging type and the matching POS code
        /// and placeholder string.
        /// </summary>
        /// <param name="sConfigFile">Path prefix; ".dct" and ".ctx" are appended to it.</param>
        /// <param name="type">Unknown word type (person, place, transliteration and so on).</param>
        /// <returns>
        /// true only when both the dictionary and the context file were loaded.
        /// Every configuration step is still performed when a load fails, so the
        /// object ends up in the same state as before; only the result reports it.
        /// </returns>
        public bool Configure(string sConfigFile, TAG_TYPE type)
        {
            // Load the unknown recognition dictionary and remember whether it worked.
            bool dictLoaded = m_dict.Load(sConfigFile + ".dct");

            // Load the unknown recognition context and remember whether it worked.
            bool contextLoaded = m_roleTag.LoadContext(sConfigFile + ".ctx");

            // Set the tagging type
            m_roleTag.SetTagType(type);
            switch (type)
            {
                case TAG_TYPE.TT_PERSON:
                case TAG_TYPE.TT_TRANS_PERSON:
                    // Persons and transliterated persons share the same flag.
                    m_nPOS = -28274; // -('n' * 256 + 'r')
                    m_sUnknownFlags = "未##人";
                    break;

                case TAG_TYPE.TT_PLACE:
                    m_nPOS = -28275; // -('n' * 256 + 's')
                    m_sUnknownFlags = "未##地";
                    break;

                default:
                    // Other types get no POS code; m_sUnknownFlags is left untouched.
                    m_nPOS = 0;
                    break;
            }

            // Report load failures instead of unconditionally claiming success,
            // so callers that test the result can actually detect a missing file.
            return dictLoaded && contextLoaded;
        }
示例#3
0
        /// <summary>
        /// Loads every resource the word segmenter needs from the given location:
        /// core dictionary, lexical context, the three unknown-word recognizers
        /// (person, place, transliterated person) and the bigram dictionary.
        /// </summary>
        /// <param name="pPath">
        /// Prefix that the resource file names are appended to as-is
        /// (no separator is inserted here).
        /// </param>
        /// <returns>false as soon as one resource fails to load; true otherwise.</returns>
        public bool InitWordSegment(string pPath)
        {
            if (!m_dictCore.Load(pPath + "coreDict.dct"))
            {
                return false;
            }

            if (!m_POSTagger.LoadContext(pPath + "lexical.ctx"))
            {
                return false;
            }
            m_POSTagger.SetTagType();

            // Short-circuit evaluation keeps the original order and stops at the
            // first recognizer that fails to configure.
            bool recognizersReady =
                m_uPerson.Configure(pPath + "nr", TAG_TYPE.TT_PERSON)
                && m_uPlace.Configure(pPath + "ns", TAG_TYPE.TT_PLACE)
                && m_uTransPerson.Configure(pPath + "tr", TAG_TYPE.TT_TRANS_PERSON);
            if (!recognizersReady)
            {
                return false;
            }

            // The bigram dictionary is the last resource; its load result is the overall result.
            return m_dictBigram.Load(pPath + "BigramDict.dct");
        }
示例#4
0
        /// <summary>
        /// Builds the word net holding all candidate segmentations of a sentence.
        /// </summary>
        /// <param name="sSentence">Sentence text; the begin/end markers are added here.</param>
        /// <returns>
        /// The generated word net, or null (after writing an error message to the
        /// console) when the core dictionary cannot be loaded.
        /// </returns>
        public RowFirstDynamicArray<ChainContent> GetSegGraph(string sSentence)
        {
            WordDictionary coreDict = new WordDictionary();
            if (coreDict.Load(coreDictFile))
            {
                // Wrap the sentence in its markers, split it into atoms and look
                // the atoms up in the core dictionary.
                string wrapped = Predefine.SENTENCE_BEGIN + sSentence + Predefine.SENTENCE_END;
                List<AtomNode> atoms = Segment.AtomSegment(wrapped);
                return Segment.GenerateWordNet(atoms, coreDict);
            }

            Console.WriteLine("字典装入错误!");
            return null;
        }
示例#5
0
        /// <summary>
        /// Imports an external word list into a dictionary. The frequency of each
        /// new word is scaled down either by a fixed ratio or by the average
        /// frequency ratio of the words both dictionaries have in common.
        /// </summary>
        /// <param name="ImportDicFile">File name of the external word list.</param>
        /// <param name="ImportEncoding">Encoding of the external word list.</param>
        /// <param name="SourceDicFile">File name of the source dct file.</param>
        /// <param name="DestDicFile">File name of the destination dct file.</param>
        /// <param name="DicFormat">Format of the external word list.</param>
        /// <param name="OddLines">Lines of the import file that are invalid and not present in the source dictionary (as reported by FindDifferent).</param>
        /// <param name="AvgFrqRate">Average frequency ratio of the import file (as reported by FindDifferent).</param>
        /// <param name="ImportFrqRate">Fixed frequency ratio: frequencies are divided by it before being stored. When it is less than or equal to 0, AvgFrqRate is used instead.</param>
        /// <returns>Number of imported entries.</returns>
        /// <exception cref="Exception">The source dictionary could not be loaded.</exception>
        public static int ImportDictionary(string ImportDicFile, Encoding ImportEncoding, string SourceDicFile, string DestDicFile, DictionaryFormat DicFormat, out string[] OddLines, out double AvgFrqRate, double ImportFrqRate = 0)
        {
            // Compare the import file with the source dictionary.
            double MaxFrqRate, MinFrqRate;
            WordDictionary.DicWordInfo[] NewWords;
            WordDictionary.DicWordInfo[] ExistWords;
            FindDifferent(ImportDicFile, ImportEncoding, DicFormat, SourceDicFile, out OddLines, out NewWords, out ExistWords, out MaxFrqRate, out MinFrqRate, out AvgFrqRate);

            // Load the dictionary the new words are merged into.
            WordDictionary dict = new WordDictionary();
            if (!dict.Load(SourceDicFile))
                throw new Exception("load source dic file fail");

            try
            {
                // The divisor does not depend on the word, so pick it once.
                // NOTE(review): if it ends up 0 or NaN the conversion below throws;
                // confirm what FindDifferent reports when no words overlap.
                double divisor = ImportFrqRate <= 0 ? AvgFrqRate : ImportFrqRate;

                // Add the new words with their scaled frequency.
                foreach (WordDictionary.DicWordInfo Word in NewWords)
                {
                    int Frq = Convert.ToInt32(Word.Frequence / divisor);
                    dict.AddWord(Word.Word, Word.Pos, Frq);
                }

                // Save the merged dictionary.
                dict.Save(DestDicFile);
            }
            finally
            {
                // Release the dictionary even when adding or saving throws.
                dict.ReleaseDict();
            }

            return NewWords.Length;
        }
示例#6
0
 /// <summary>
 /// Finds the differences between an import word list and an existing
 /// dictionary file. Loads the existing dictionary and delegates the
 /// comparison to the overload that takes a loaded dictionary.
 /// </summary>
 /// <param name="NewDicFile">Import word list file.</param>
 /// <param name="Encoding">Encoding of the import file.</param>
 /// <param name="DicFormat">Format of the import file.</param>
 /// <param name="SourceDictFileName">Existing dictionary file.</param>
 /// <param name="OddLines">Lines without a part-of-speech tag whose word is not in the existing dictionary either.</param>
 /// <param name="NewWords">New words, or new parts of speech for existing words.</param>
 /// <param name="ExistWords">Words already present with the same part of speech.</param>
 /// <param name="MaxFrqRate">Largest frequency ratio among the duplicated words.</param>
 /// <param name="MinFrqRate">Smallest frequency ratio among the duplicated words.</param>
 /// <param name="AvgFrqRate">Average frequency ratio of the duplicated words.</param>
 /// <exception cref="Exception">The existing dictionary could not be loaded.</exception>
 public static void FindDifferent(string NewDicFile, Encoding Encoding, DictionaryFormat DicFormat, string SourceDictFileName,
     out string[] OddLines, out WordDictionary.DicWordInfo[] NewWords, out WordDictionary.DicWordInfo[] ExistWords,
     out double MaxFrqRate, out double MinFrqRate, out double AvgFrqRate)
 {
     WordDictionary SourceDict = new WordDictionary();
     if (!SourceDict.Load(SourceDictFileName))
         throw new Exception("load source dic file fail");

     try
     {
         FindDifferent(NewDicFile, Encoding, DicFormat, SourceDict, out OddLines, out NewWords, out ExistWords, out MaxFrqRate, out MinFrqRate, out AvgFrqRate);
     }
     finally
     {
         // Release the loaded dictionary even when the comparison throws.
         SourceDict.ReleaseDict();
     }
 }
示例#7
0
        /// <summary>
        /// Demo routine: runs the bigram segmenter over a set of sample sentences
        /// and prints, for each one, its atom segmentation followed by the
        /// segmentation results.
        /// </summary>
        public static void TestBiSegment()
        {
            // Sample sentences and, at the same index, the label printed above each one.
            string[] sentence =
            {
                @"他说的的确实在理",
                @"张华平3-4月份来北京开会",
                @"1.加强管理",
                @"他出生于1980年1月1日10点",
                @"他出生于甲子年",
                @"馆内陈列周恩来和邓颖超生前使用过的物品"
            };
            string[] description =
            {
                @"普通分词测试",
                @"数字切分",
                @"剔除多余的“.”",
                @"日期合并",
                @"年份识别",
                @"姓名识别"
            };

            // Both dictionaries are required; report the failure and stop if either cannot be loaded.
            WordDictionary coreDict = new WordDictionary();
            if (!coreDict.Load(coreDictFile))
            {
                Console.WriteLine("coreDict 字典装入错误!");
                return;
            }

            WordDictionary biDict = new WordDictionary();
            if (!biDict.Load(biDictFile))
            {
                Console.WriteLine("字典装入错误!");
                return;
            }

            for (int index = 0; index < sentence.Length; index++)
            {
                Console.WriteLine("\r\n============ {0} ============", description[index]);

                // Surround the sample text with the sentence begin/end markers.
                string wrapped = Predefine.SENTENCE_BEGIN + sentence[index] + Predefine.SENTENCE_END;

                Console.WriteLine("原子切分:");
                foreach (AtomNode atom in Segment.AtomSegment(wrapped))
                {
                    Console.Write("{0}, ", atom.sWord);
                }

                Console.WriteLine("\r\n\r\n实际切分:");

                // A fresh segmenter is created for every sentence, as before.
                Segment segment = new Segment(biDict, coreDict);
                segment.BiSegment(wrapped, 0.1, 1);

                // One output line per segmentation result.
                for (int k = 0; k < segment.m_pWordSeg.Count; k++)
                {
                    foreach (var word in segment.m_pWordSeg[k])
                    {
                        Console.Write("{0}, ", word.sWord);
                    }
                    Console.WriteLine();
                }
            }
        }
示例#8
0
        /// <summary>
        /// Demo routine: builds the word net for a fixed sample sentence and
        /// prints it to the console.
        /// </summary>
        public static void TestGenerateWordNet()
        {
            WordDictionary coreDict = new WordDictionary();
            bool loaded = coreDict.Load(coreDictFile);
            if (!loaded)
            {
                Console.WriteLine("字典装入错误!");
                return;
            }

            // Surround the sample text with the sentence begin/end markers.
            string wrapped = Predefine.SENTENCE_BEGIN + @"人民币现在很值钱" + Predefine.SENTENCE_END;

            // Split into atoms, then look the atoms up in the core dictionary.
            List<AtomNode> atoms = Segment.AtomSegment(wrapped);
            RowFirstDynamicArray<ChainContent> wordNet = Segment.GenerateWordNet(atoms, coreDict);

            Console.WriteLine(wordNet.ToString());
        }
示例#9
0
        /// <summary>
        /// Demo routine: loads the core dictionary and dumps the entries stored
        /// under index ids 2 to 5 (word length, frequency, part of speech, word).
        /// </summary>
        public static void TestDictionary()
        {
            WordDictionary dict = new WordDictionary();
            if (!dict.Load(coreDictFile, false))
            {
                Console.WriteLine("Wrong!");
                return;
            }

            for (int id = 2; id <= 5; id++)
            {
                // Header: the character that belongs to this index id.
                Console.WriteLine("====================================\r\n汉字:{0}, ID :{1}\r\n", Utility.CC_ID2Char(id), id);
                Console.WriteLine("  词长  频率  词性   词");

                // One row per entry stored under this id.
                for (int row = 0; row < dict.indexTable[id].nCount; row++)
                {
                    var entry = dict.indexTable[id].WordItems[row];
                    Console.WriteLine("{0,5} {1,6} {2,5}  ({3}){4}",
                        entry.nWordLen,
                        entry.nFrequency,
                        Utility.GetPOSString(entry.nPOS),
                        Utility.CC_ID2Char(id),
                        entry.sWord);
                }
            }
        }