/// <summary>
/// Demo driver: builds the bigram graph for a fixed sample sentence and prints it to the console.
/// </summary>
/// <remarks>
/// The core dictionary is loaded first, then the bigram dictionary. If either load fails,
/// an error message is printed and the method returns without doing any segmentation.
/// </remarks>
public static void TestBiGraphGenerate()
{
    WordDictionary core = new WordDictionary();
    bool coreLoaded = core.Load(coreDictFile);
    if (!coreLoaded)
    {
        Console.WriteLine("coreDict 字典装入错误!");
        return;
    }

    WordDictionary bigram = new WordDictionary();
    bool bigramLoaded = bigram.Load(biDictFile);
    if (!bigramLoaded)
    {
        Console.WriteLine("字典装入错误!");
        return;
    }

    // Wrap the sample sentence in the begin/end markers before segmenting.
    string wrapped = Predefine.SENTENCE_BEGIN + @"他说的确实在理" + Predefine.SENTENCE_END;

    // Atom segmentation.
    List<AtomNode> atoms = Segment.AtomSegment(wrapped);

    // Look up the dictionary and collect every candidate word into the word net.
    RowFirstDynamicArray<ChainContent> wordNet = Segment.GenerateWordNet(atoms, core);

    // Enumerate all possible pairwise combinations.
    ColumnFirstDynamicArray<ChainContent> biGraph = Segment.BiGraphGenerate(wordNet, 0.1, bigram, core);

    Console.WriteLine(biGraph.ToString());
}
/// <summary>
/// Loads the unknown-word recognition dictionary and context, then sets the tagging type
/// together with the POS code and placeholder flag that belong to it.
/// </summary>
/// <param name="sConfigFile">Path prefix of the resource files; ".dct" and ".ctx" are appended to it.</param>
/// <param name="type">Unknown word type (person, place, transliteration and so on).</param>
/// <returns>
/// <c>true</c> when both the dictionary and the context were loaded; otherwise <c>false</c>.
/// </returns>
public bool Configure(string sConfigFile, TAG_TYPE type)
{
    // Both results are kept (no early return) so that the tag type, POS code and flag
    // are still assigned exactly as before even when a load fails; only the return
    // value now reflects the failure, which InitWordSegment already checks for.
    // NOTE(review): assumes m_dict.Load and m_roleTag.LoadContext return bool, like the
    // Load/LoadContext calls tested in InitWordSegment - confirm.

    // Load the unknown recognition dictionary
    bool dictLoaded = m_dict.Load(sConfigFile + ".dct");

    // Load the unknown recognition context
    bool contextLoaded = m_roleTag.LoadContext(sConfigFile + ".ctx");

    // Set the tagging type
    m_roleTag.SetTagType(type);

    switch (type)
    {
        case TAG_TYPE.TT_PERSON:
        case TAG_TYPE.TT_TRANS_PERSON:
            // Persons and transliterated persons share the same POS code and flag.
            m_nPOS = -28274; //-'n'*256-'r';
            m_sUnknownFlags = "未##人";
            break;

        case TAG_TYPE.TT_PLACE:
            m_nPOS = -28275; //-'n'*256-'s';
            m_sUnknownFlags = "未##地";
            break;

        default:
            // Any other type: only the POS code is reset; m_sUnknownFlags is left untouched.
            m_nPOS = 0;
            break;
    }

    return dictLoaded && contextLoaded;
}
/// <summary>
/// Loads every resource the segmenter needs from the given directory prefix.
/// </summary>
/// <param name="pPath">
/// Prefix that the resource file names are appended to by plain string concatenation
/// (no path separator is inserted).
/// </param>
/// <returns>
/// <c>true</c> when every step succeeded; <c>false</c> as soon as one step fails,
/// in which case the remaining steps are not attempted.
/// </returns>
public bool InitWordSegment(string pPath)
{
    // Core dictionary first, then the lexical context for the POS tagger.
    if (!m_dictCore.Load(pPath + "coreDict.dct") || !m_POSTagger.LoadContext(pPath + "lexical.ctx"))
    {
        return false;
    }

    m_POSTagger.SetTagType();

    // && short-circuits, so the recognizers and the bigram dictionary are processed
    // in this exact order and everything after the first failure is skipped.
    return m_uPerson.Configure(pPath + "nr", TAG_TYPE.TT_PERSON)
        && m_uPlace.Configure(pPath + "ns", TAG_TYPE.TT_PLACE)
        && m_uTransPerson.Configure(pPath + "tr", TAG_TYPE.TT_TRANS_PERSON)
        && m_dictBigram.Load(pPath + "BigramDict.dct");
}
/// <summary>
/// Gets all possible segmentation schemes for a sentence as a word net.
/// </summary>
/// <param name="sSentence">The raw sentence; begin/end markers are added by this method.</param>
/// <returns>
/// The word net generated from the sentence, or <c>null</c> when the core dictionary
/// cannot be loaded (an error message is written to the console in that case).
/// </returns>
public RowFirstDynamicArray<ChainContent> GetSegGraph(string sSentence)
{
    // A fresh core dictionary is loaded on every call.
    WordDictionary dictionary = new WordDictionary();
    if (dictionary.Load(coreDictFile))
    {
        string wrapped = Predefine.SENTENCE_BEGIN + sSentence + Predefine.SENTENCE_END;
        List<AtomNode> atoms = Segment.AtomSegment(wrapped);
        return Segment.GenerateWordNet(atoms, dictionary);
    }

    Console.WriteLine("字典装入错误!");
    return null;
}
/// <summary>
/// Imports an external dictionary into a copy of the source dictionary. Frequencies of the
/// imported words are scaled down by a ratio: either a fixed one, or the average frequency
/// ratio measured on the words the two dictionaries have in common.
/// </summary>
/// <param name="ImportDicFile">File name of the external dictionary.</param>
/// <param name="ImportEncoding">Text encoding of the external dictionary file.</param>
/// <param name="SourceDicFile">File name of the source dct file.</param>
/// <param name="DestDicFile">File name of the destination dct file.</param>
/// <param name="DicFormat">Format of the external dictionary.</param>
/// <param name="OddLines">Entries of the imported file that are invalid and not present in the source dictionary.</param>
/// <param name="AvgFrqRate">Average frequency ratio of the imported file.</param>
/// <param name="ImportFrqRate">
/// Fixed frequency ratio for the import (each frequency is divided by it before being stored).
/// When less than or equal to 0, <paramref name="AvgFrqRate"/> is used instead.
/// </param>
/// <returns>The number of imported entries.</returns>
/// <exception cref="Exception">The source dictionary file could not be loaded.</exception>
public static int ImportDictionary(string ImportDicFile, Encoding ImportEncoding, string SourceDicFile, string DestDicFile, DictionaryFormat DicFormat, out string[] OddLines, out double AvgFrqRate, double ImportFrqRate = 0)
{
    // Compare the two dictionaries; only the new words and the average ratio are used below.
    double MaxFrqRate, MinFrqRate;
    WordDictionary.DicWordInfo[] NewWords;
    WordDictionary.DicWordInfo[] ExistWords;
    FindDifferent(ImportDicFile, ImportEncoding, DicFormat, SourceDicFile, out OddLines, out NewWords, out ExistWords, out MaxFrqRate, out MinFrqRate, out AvgFrqRate);

    // Load the dictionary the new words will be merged into.
    WordDictionary dict = new WordDictionary();
    if (!dict.Load(SourceDicFile))
    {
        throw new Exception("load source dic file fail");
    }

    try
    {
        // The divisor does not change per word, so pick it once.
        // NOTE(review): if the chosen divisor is 0 or NaN, Convert.ToInt32 will throw an
        // OverflowException on the non-finite quotient - confirm whether FindDifferent can
        // produce such an AvgFrqRate (e.g. when no words overlap).
        double divisor = ImportFrqRate <= 0 ? AvgFrqRate : ImportFrqRate;

        // Add the new words with their scaled frequency.
        foreach (WordDictionary.DicWordInfo entry in NewWords)
        {
            int frequency = Convert.ToInt32(entry.Frequence / divisor);
            dict.AddWord(entry.Word, entry.Pos, frequency);
        }

        // Save the merged dictionary.
        dict.Save(DestDicFile);
        return NewWords.Length;
    }
    finally
    {
        // Release the loaded dictionary even when adding or saving throws.
        dict.ReleaseDict();
    }
}
/// <summary>
/// Finds the differences between a dictionary to import and the existing dictionary.
/// Loads the existing dictionary from file and delegates to the overload that takes
/// an already loaded <see cref="WordDictionary"/>.
/// </summary>
/// <param name="NewDicFile">File of the dictionary to import.</param>
/// <param name="Encoding">Text encoding of the dictionary to import.</param>
/// <param name="DicFormat">Format of the dictionary to import.</param>
/// <param name="SourceDictFileName">File of the existing dictionary.</param>
/// <param name="OddLines">Receives the lines whose word has no part-of-speech tag and is not in the existing dictionary either.</param>
/// <param name="NewWords">Receives the new words, and new parts of speech for existing words.</param>
/// <param name="ExistWords">Receives the duplicated words whose part of speech is also the same.</param>
/// <param name="MaxFrqRate">Receives the largest frequency ratio among the duplicated words.</param>
/// <param name="MinFrqRate">Receives the smallest frequency ratio among the duplicated words.</param>
/// <param name="AvgFrqRate">Receives the average frequency ratio of the duplicated words.</param>
/// <exception cref="Exception">The existing dictionary file could not be loaded.</exception>
public static void FindDifferent(string NewDicFile, Encoding Encoding, DictionaryFormat DicFormat, string SourceDictFileName, out string[] OddLines, out WordDictionary.DicWordInfo[] NewWords, out WordDictionary.DicWordInfo[] ExistWords, out double MaxFrqRate, out double MinFrqRate, out double AvgFrqRate)
{
    WordDictionary SourceDict = new WordDictionary();
    if (!SourceDict.Load(SourceDictFileName))
    {
        throw new Exception("load source dic file fail");
    }

    try
    {
        FindDifferent(NewDicFile, Encoding, DicFormat, SourceDict, out OddLines, out NewWords, out ExistWords, out MaxFrqRate, out MinFrqRate, out AvgFrqRate);
    }
    finally
    {
        // Release the loaded dictionary even when the comparison throws.
        SourceDict.ReleaseDict();
    }
}
/// <summary>
/// Demo driver: runs bigram segmentation over a fixed set of sample sentences and prints,
/// for each one, its description, its atom segmentation and the resulting segmentation.
/// </summary>
/// <remarks>
/// The core dictionary is loaded first, then the bigram dictionary. If either load fails,
/// an error message is printed and no sentence is processed.
/// </remarks>
public static void TestBiSegment()
{
    // Each row: { sentence, description of what the sentence exercises }.
    string[,] samples =
    {
        { @"他说的的确实在理", @"普通分词测试" },
        { @"张华平3-4月份来北京开会", @"数字切分" },
        { @"1.加强管理", @"剔除多余的“.”" },
        { @"他出生于1980年1月1日10点", @"日期合并" },
        { @"他出生于甲子年", @"年份识别" },
        { @"馆内陈列周恩来和邓颖超生前使用过的物品", @"姓名识别" }
    };

    WordDictionary core = new WordDictionary();
    if (!core.Load(coreDictFile))
    {
        Console.WriteLine("coreDict 字典装入错误!");
        return;
    }

    WordDictionary bigram = new WordDictionary();
    if (!bigram.Load(biDictFile))
    {
        Console.WriteLine("字典装入错误!");
        return;
    }

    int sampleCount = samples.GetLength(0);
    for (int row = 0; row < sampleCount; row++)
    {
        Console.WriteLine("\r\n============ {0} ============", samples[row, 1]);

        string wrapped = Predefine.SENTENCE_BEGIN + samples[row, 0] + Predefine.SENTENCE_END;

        // Atom segmentation of the wrapped sentence.
        List<AtomNode> atoms = Segment.AtomSegment(wrapped);
        Console.WriteLine("原子切分:");
        foreach (AtomNode atom in atoms)
        {
            Console.Write("{0}, ", atom.sWord);
        }

        Console.WriteLine("\r\n\r\n实际切分:");

        // A new segmenter is created for every sentence.
        Segment segmenter = new Segment(bigram, core);
        segmenter.BiSegment(wrapped, 0.1, 1);

        // One output line per segmentation result.
        for (int result = 0; result < segmenter.m_pWordSeg.Count; result++)
        {
            for (int word = 0; word < segmenter.m_pWordSeg[result].Length; word++)
            {
                Console.Write("{0}, ", segmenter.m_pWordSeg[result][word].sWord);
            }

            Console.WriteLine();
        }
    }
}
/// <summary>
/// Demo driver: generates the word net for a fixed sample sentence and prints it to the console.
/// </summary>
/// <remarks>
/// If the core dictionary cannot be loaded, an error message is printed and nothing else happens.
/// </remarks>
public static void TestGenerateWordNet()
{
    WordDictionary dictionary = new WordDictionary();
    bool loaded = dictionary.Load(coreDictFile);
    if (!loaded)
    {
        Console.WriteLine("字典装入错误!");
        return;
    }

    // Wrap the sample sentence in the begin/end markers before segmenting.
    string wrapped = Predefine.SENTENCE_BEGIN + @"人民币现在很值钱" + Predefine.SENTENCE_END;

    List<AtomNode> atoms = Segment.AtomSegment(wrapped);
    RowFirstDynamicArray<ChainContent> wordNet = Segment.GenerateWordNet(atoms, dictionary);

    Console.WriteLine(wordNet.ToString());
}
/// <summary>
/// Demo driver: loads the core dictionary and dumps the word items stored under
/// index ids 2 through 5 (length, frequency, part of speech and word text).
/// </summary>
/// <remarks>
/// Prints "Wrong!" and returns when the dictionary cannot be loaded.
/// </remarks>
public static void TestDictionary()
{
    WordDictionary dictionary = new WordDictionary();
    if (!dictionary.Load(coreDictFile, false))
    {
        Console.WriteLine("Wrong!");
        return;
    }

    for (int id = 2; id <= 5; id++)
    {
        Console.WriteLine("====================================\r\n汉字:{0}, ID :{1}\r\n", Utility.CC_ID2Char(id), id);
        Console.WriteLine(" 词长 频率 词性 词");

        // Alias the index entry so the item loop below reads more easily.
        var bucket = dictionary.indexTable[id];
        for (int n = 0; n < bucket.nCount; n++)
        {
            var item = bucket.WordItems[n];
            Console.WriteLine("{0,5} {1,6} {2,5} ({3}){4}",
                item.nWordLen,
                item.nFrequency,
                Utility.GetPOSString(item.nPOS),
                Utility.CC_ID2Char(id),
                item.sWord);
        }
    }
}