public static void Build(DictionaryFormat format, string inputDirname, string outputDirname, string encoding, bool normalizeEntry)
{
    // Runs three independent build stages in sequence. Each stage reads from
    // inputDirname, writes its result to outputDirname and reports progress
    // on the console.

    // Stage 1: token info dictionary.
    Console.WriteLine("building tokeninfo dict...");
    TokenInfoDictionaryWriter tokenInfoWriter =
        new TokenInfoDictionaryBuilder(format, encoding, normalizeEntry).Build(inputDirname);
    tokenInfoWriter.Write(outputDirname);
    Console.WriteLine("done");

    // Stage 2: unknown word dictionary.
    Console.WriteLine("building unknown word dict...");
    UnknownDictionaryWriter unknownWordWriter =
        new UnknownDictionaryBuilder(encoding).Build(inputDirname);
    unknownWordWriter.Write(outputDirname);
    Console.WriteLine("done");

    // Stage 3: connection costs, read from "matrix.def" inside the input directory.
    Console.WriteLine("building connection costs...");
    string matrixDefinitionPath = inputDirname + System.IO.Path.DirectorySeparatorChar + "matrix.def";
    ConnectionCostsWriter connectionCostsWriter = ConnectionCostsBuilder.Build(matrixDefinitionPath);
    connectionCostsWriter.Write(outputDirname);
    Console.WriteLine("done");
}
public void ShouldSetKeyValuePairSeparatorWithinOutput()
{
    // Arrange: default format with "|" placed between key-value pairs.
    var format = DictionaryFormat.CreateDefault().SetKeyValuePairSeparator("|");

    // Act
    var render = format.Compile<int, string>();

    // Assert: the setting is exposed on the format and applied to the output.
    Assert.That(format.KeyValuePairSeparator, Is.EqualTo("|"));
    Assert.That(
        render(Dictionary),
        Is.EqualTo("{0:'null'|1:'Leroy Jenkins'|7:'James Bond'}")
    );
}
public void ShouldSetKeyValuePairPrefixAndSuffixWithinOutput()
{
    // Arrange: default format with every key-value pair wrapped in "<" and ">".
    var format = DictionaryFormat.CreateDefault().SetKeyValuePrefixAndSuffix("<", ">");

    // Act
    var render = format.Compile<int, string>();

    // Assert: the setting is exposed on the format and applied to the output.
    Assert.That(format.KeyValuePairPrefixAndSuffix, Is.EqualTo(("<", ">")));
    Assert.That(
        render(Dictionary),
        Is.EqualTo("{<0:'null'>, <1:'Leroy Jenkins'>, <7:'James Bond'>}")
    );
}
public void ShouldSetKeyPrefixAndSuffixWithinOutput()
{
    // Arrange: default format with every key wrapped in "[" and "]".
    var format = DictionaryFormat.CreateDefault().SetKeyPrefixAndSuffix("[", "]");

    // Act
    var render = format.Compile<int, string>();

    // Assert: the setting is exposed on the format and applied to the output.
    Assert.That(format.KeyPrefixAndSuffix, Is.EqualTo(("[", "]")));
    Assert.That(
        render(Dictionary),
        Is.EqualTo("{[0]:'null', [1]:'Leroy Jenkins', [7]:'James Bond'}")
    );
}
public void ShouldSetValuePrefixAndSuffixWithinOutput()
{
    // Arrange: default format with every value wrapped in "(" and ")".
    var format = DictionaryFormat.CreateDefault().SetValuePrefixAndSuffix("(", ")");

    // Act
    var render = format.Compile<int, string>();

    // Assert: the setting is exposed on the format and applied to the output.
    Assert.That(format.ValuePrefixAndSuffix, Is.EqualTo(("(", ")")));
    Assert.That(
        render(Dictionary),
        Is.EqualTo("{0:(null), 1:(Leroy Jenkins), 7:(James Bond)}")
    );
}
public void ShouldIncludeDictionaryPrefixAndSuffixIntoOutput()
{
    // Arrange: default format with the whole dictionary wrapped in "<" and ">".
    var format = DictionaryFormat.CreateDefault().SetDictionaryPrefixAndSuffix("<", ">");

    // Act
    var render = format.Compile<int, string>();

    // Assert: the setting is exposed on the format and applied to the output.
    Assert.That(format.DictionaryPrefixAndSuffix, Is.EqualTo(("<", ">")));
    Assert.That(
        render(Dictionary),
        Is.EqualTo("<0:'null', 1:'Leroy Jenkins', 7:'James Bond'>")
    );
}
public void ShouldRemovePreviouslyAddedOption()
{
    // Arrange: add ItemPerLine and immediately remove it again.
    var format = DictionaryFormat
        .CreateDefault()
        .AddOptions(DictionaryFormatOptions.ItemPerLine)
        .RemoveOptions(DictionaryFormatOptions.ItemPerLine);

    // Act
    var render = format.Compile<int, string>();

    // Assert: no option is left and the output is the single-line default.
    Assert.That(format.Options, Is.EqualTo(DictionaryFormatOptions.None));
    Assert.That(
        render(Dictionary),
        Is.EqualTo("{0:'null', 1:'Leroy Jenkins', 7:'James Bond'}")
    );
}
public void ShouldApplyNonGenericFunctionToGenericDictionary()
{
    // Arrange: formatter compiled without type arguments, fed a generic dictionary.
    var render = DictionaryFormat.CreateDefault().Compile();
    var people = new Dictionary<int, string>
    {
        { 7, "James Bond" },
        { 8, "Bill Timothy" },
        { 12, "Sam Johnston" }
    };

    // Act + Assert
    Assert.That(
        render(people),
        Is.EqualTo("{7:'James Bond', 8:'Bill Timothy', 12:'Sam Johnston'}")
    );
}
public void ShouldApplyFormatToOldDictionary()
{
    // Arrange: formatter compiled without type arguments, fed a non-generic Hashtable.
    var render = DictionaryFormat.CreateDefault().Compile();
    var people = new Hashtable
    {
        { 7, "James Bond" },
        { 8, "Bill Timothy" },
        { 12, "Sam Johnston" }
    };

    // Act + Assert
    // NOTE(review): the expected text lists the entries as 12, 8, 7, i.e. it relies on
    // the order in which Hashtable enumerates these keys - confirm that this is stable
    // on every runtime the tests target.
    Assert.That(
        render(people),
        Is.EqualTo("{12:'Sam Johnston', 8:'Bill Timothy', 7:'James Bond'}")
    );
}
public void ShouldCreateDefaultFormat()
{
    // Arrange
    (string, string) noAffixes = (null, null);
    var format = DictionaryFormat.CreateDefault();

    // Act
    var render = format.Compile<int, string>();

    // Assert: separators of the default format.
    Assert.That(format.KeyValueSeparator, Is.EqualTo(":"));
    Assert.That(format.KeyValuePairSeparator, Is.EqualTo(", "));

    // Assert: prefix/suffix pairs of the default format.
    Assert.That(format.KeyPrefixAndSuffix, Is.EqualTo(noAffixes));
    Assert.That(format.ValuePrefixAndSuffix, Is.EqualTo(("'", "'")));
    Assert.That(format.KeyValuePairPrefixAndSuffix, Is.EqualTo(noAffixes));
    Assert.That(format.DictionaryPrefixAndSuffix, Is.EqualTo(("{", "}")));

    // Assert: options and the resulting text.
    Assert.That(format.Options, Is.EqualTo(DictionaryFormatOptions.None));
    Assert.That(
        render(Dictionary),
        Is.EqualTo("{0:'null', 1:'Leroy Jenkins', 7:'James Bond'}")
    );
}
public void ShouldInitializeEmptyFormat()
{
    // Arrange: a format created through the constructor rather than CreateDefault().
    (string, string) noAffixes = (null, null);
    var format = new DictionaryFormat();

    // Act
    var render = format.Compile<int, string>();

    // Assert: no separators are configured.
    Assert.That(format.KeyValueSeparator, Is.Null);
    Assert.That(format.KeyValuePairSeparator, Is.Null);

    // Assert: no prefix/suffix pairs are configured.
    Assert.That(format.KeyPrefixAndSuffix, Is.EqualTo(noAffixes));
    Assert.That(format.ValuePrefixAndSuffix, Is.EqualTo(noAffixes));
    Assert.That(format.KeyValuePairPrefixAndSuffix, Is.EqualTo(noAffixes));
    Assert.That(format.DictionaryPrefixAndSuffix, Is.EqualTo(noAffixes));

    // Assert: no options, and keys and values are emitted back to back.
    Assert.That(format.Options, Is.EqualTo(DictionaryFormatOptions.None));
    Assert.That(
        render(Dictionary),
        Is.EqualTo("0null1Leroy Jenkins7James Bond")
    );
}
public void ShouldProduceNewLineSeparatedOutputWithLineBreak()
{
    // Arrange: no pair separator, one item per line.
    var format = DictionaryFormat
        .CreateDefault()
        .SetKeyValuePairSeparator(null)
        .AddOptions(DictionaryFormatOptions.ItemPerLine);

    // Act
    var render = format.Compile<int, string>();

    // Assert
    Assert.That(format.Options, Is.EqualTo(DictionaryFormatOptions.ItemPerLine));

    var actualText = render(Dictionary);

    // Every line except the closing brace ends with the platform line terminator.
    var expectedText = new StringBuilder();
    expectedText.AppendLine("{");
    expectedText.AppendLine("0:'null'");
    expectedText.AppendLine("1:'Leroy Jenkins'");
    expectedText.AppendLine("7:'James Bond'");
    expectedText.Append("}");

    Assert.That(actualText, Is.EqualTo(expectedText.ToString()));
}
public void ShouldAlwaysCreateNewInstanceOfDefaultFormat()
{
    // Arrange: customise one default format heavily...
    var customizedFormat = DictionaryFormat
        .CreateDefault()
        .AddOptions(DictionaryFormatOptions.ItemPerLine)
        .SetKeyPrefixAndSuffix("[", "]")
        .SetKeyValuePrefixAndSuffix("<", ">");
    var customizedRender = customizedFormat.Compile<int, string>();

    // ...then request another default format.
    var pristineFormat = DictionaryFormat.CreateDefault();
    var pristineRender = pristineFormat.Compile<int, string>();

    // Assert: neither the format nor the compiled function is shared,
    // and the second format still renders with the default settings.
    Assert.That(pristineFormat, Is.Not.SameAs(customizedFormat));
    Assert.That(pristineRender, Is.Not.SameAs(customizedRender));
    Assert.That(
        pristineRender(Dictionary),
        Is.EqualTo("{0:'null', 1:'Leroy Jenkins', 7:'James Bond'}")
    );
}
/// <summary>
/// Registers a <see cref="DictionaryFormat"/> for converting members of <see cref="IDictionary"/> type
/// to string. Only members whose declared types are assignable to <see cref="IDictionary"/> are affected.
/// </summary>
/// <param name="setup">
/// Function that receives the default <see cref="DictionaryFormat"/> and returns either a modified version
/// of it or a brand new instance to be used by the <see cref="object.ToString"/> function being built.
/// When it returns null, then the default <see cref="DictionaryFormat"/> (the one passed to it) is used.
/// <see cref="DictionaryFormat.CreateDefault"/> is used when the function is omitted (or null is passed).
/// </param>
/// <returns>Updated <see cref="ToStringBuilder{TTarget}"/> instance.</returns>
public ToStringBuilder<TTarget> UseDictionaryFormat(
    Func<DictionaryFormat, DictionaryFormat> setup = null
)
{
    // Start from a fresh default format and let the caller customise it.
    var defaultFormat = DictionaryFormat.CreateDefault();
    var effectiveFormat = SetupFormat(defaultFormat, setup);

    // Compile the chosen format and register the resulting function.
    return Use(effectiveFormat.Compile());
}
/// <summary>
/// Returns the part-of-speech conversion map for the given dictionary format.
/// </summary>
/// <param name="DicFormat">Format of the external dictionary.</param>
/// <returns>
/// A map from lower-cased external part-of-speech tag to lower-cased internal tag.
/// The map is empty when no conversion table is defined for <paramref name="DicFormat"/>.
/// </returns>
private static Dictionary<string, string> getPosTransformMap(DictionaryFormat DicFormat)
{
    Dictionary<string, string> PosTrans = new Dictionary<string, string>(50);

    string PosTransString = null;
    switch (DicFormat)
    {
        case DictionaryFormat.SogouW2006:
        case DictionaryFormat.ExcelCSV:
            PosTransString = SogouW2006PosTrans;
            break;
    }

    // No table for this format: return an empty map instead of dereferencing null.
    if (string.IsNullOrEmpty(PosTransString))
    {
        return PosTrans;
    }

    // The table is a whitespace-separated list of "from-to" pairs. Empty tokens
    // (produced by consecutive whitespace characters) are skipped, because they
    // have no "to" part to index.
    foreach (string PosT in PosTransString.Split((char[])null, StringSplitOptions.RemoveEmptyEntries))
    {
        string[] s = PosT.Split('-');
        PosTrans.Add(s[0].ToLower(), s[1].ToLower());
    }

    return PosTrans;
}
/// <summary>
/// Imports an external dictionary into a copy of the source dictionary. Frequencies of the
/// imported words are scaled by a frequency ratio before they are stored.
/// </summary>
/// <param name="ImportDicFile">File name of the external dictionary.</param>
/// <param name="ImportEncoding">Text encoding of the external dictionary file.</param>
/// <param name="SourceDicFile">File name of the source dct file.</param>
/// <param name="DestDicFile">File name of the destination dct file.</param>
/// <param name="DicFormat">Format of the external dictionary.</param>
/// <param name="OddLines">Receives the lines that FindDifferent reports as odd.</param>
/// <param name="AvgFrqRate">Receives the average frequency ratio reported by FindDifferent.</param>
/// <param name="ImportFrqRate">
/// Fixed frequency ratio: imported frequencies are divided by this number before being stored.
/// When it is less than or equal to 0, <paramref name="AvgFrqRate"/> is used instead.
/// </param>
/// <returns>The number of imported entries.</returns>
/// <exception cref="Exception">The source dictionary file could not be loaded.</exception>
public static int ImportDictionary(string ImportDicFile, Encoding ImportEncoding, string SourceDicFile, string DestDicFile, DictionaryFormat DicFormat, out string[] OddLines, out double AvgFrqRate, double ImportFrqRate = 0)
{
    // Classify the imported entries against the source dictionary.
    double MaxFrqRate, MinFrqRate;
    WordDictionary.DicWordInfo[] NewWords;
    WordDictionary.DicWordInfo[] ExistWords;
    FindDifferent(ImportDicFile, ImportEncoding, DicFormat, SourceDicFile, out OddLines, out NewWords, out ExistWords, out MaxFrqRate, out MinFrqRate, out AvgFrqRate);

    // Load the dictionary that receives the new words.
    WordDictionary dict = new WordDictionary();
    if (!dict.Load(SourceDicFile))
    {
        throw new Exception("load source dic file fail");
    }

    try
    {
        // Pick the divisor: the fixed ratio when given, the measured average otherwise.
        double FrqRate = ImportFrqRate <= 0 ? AvgFrqRate : ImportFrqRate;

        // The average is 0 when no imported entry overlaps the source dictionary.
        // Dividing by it yields infinity/NaN, which Convert.ToInt32 rejects with an
        // OverflowException, so the imported frequency is kept unscaled in that case.
        if (FrqRate == 0 || double.IsNaN(FrqRate))
        {
            FrqRate = 1;
        }

        // Add the new words.
        foreach (WordDictionary.DicWordInfo Word in NewWords)
        {
            int Frq = Convert.ToInt32(Word.Frequence / FrqRate);
            dict.AddWord(Word.Word, Word.Pos, Frq);
        }

        // Save the result.
        dict.Save(DestDicFile);
    }
    finally
    {
        // Release the dictionary even when adding or saving fails.
        dict.ReleaseDict();
    }

    return NewWords.Length;
}
/// <summary>
/// Finds the differences between a dictionary file to import and an existing dictionary.
/// </summary>
/// <param name="NewDicFile">File of the dictionary to import.</param>
/// <param name="Encoding">Text encoding of the import file.</param>
/// <param name="DicFormat">
/// Format of the import file. SogouW2006 lines are split on tab/space into word, frequency and
/// parts of speech; every other format is parsed as a "word,frequency,parts-of-speech" line.
/// </param>
/// <param name="SourceDict">The existing dictionary object.</param>
/// <param name="OddLines">Receives lines that carry no part of speech and whose word is not in the existing dictionary.</param>
/// <param name="NewWords">Receives new words, or new parts of speech of existing words.</param>
/// <param name="ExistWords">Receives entries whose word and part of speech both exist already.</param>
/// <param name="MaxFrqRate">Largest frequency ratio of the existing entries; stays double.MinValue when there are none.</param>
/// <param name="MinFrqRate">Smallest frequency ratio of the existing entries; stays double.MaxValue when there are none.</param>
/// <param name="AvgFrqRate">Average frequency ratio of the existing entries; 0 when there are none.</param>
public static void FindDifferent(string NewDicFile, Encoding Encoding, DictionaryFormat DicFormat, WordDictionary SourceDict, out string[] OddLines, out WordDictionary.DicWordInfo[] NewWords, out WordDictionary.DicWordInfo[] ExistWords, out double MaxFrqRate, out double MinFrqRate, out double AvgFrqRate)
{
    // Initialise the statistics.
    MaxFrqRate = double.MinValue;
    MinFrqRate = double.MaxValue;
    decimal SumFrqRate = 0;

    // Prepare the part-of-speech conversion.
    Dictionary<string, string> PosTrans = getPosTransformMap(DicFormat);

    // Snapshot of the existing dictionary; it is looked up by lower-cased word.
    Dictionary<string, WordDictionary.DicWordInfo> OldWords = SourceDict.ToWordDictionary();

    // Result buffers.
    List<string> Odds = new List<string>(OldWords.Count / 2);
    List<WordDictionary.DicWordInfo> Exists = new List<WordDictionary.DicWordInfo>(OldWords.Count / 2);
    List<WordDictionary.DicWordInfo> News = new List<WordDictionary.DicWordInfo>(OldWords.Count / 2);

    // Read the import file and collect the frequency ratios of the entries that already
    // exist, so that the relation between both frequency scales can be estimated.
    foreach (string Line in File.ReadAllLines(NewDicFile, Encoding))
    {
        string Word;
        int Frq;
        string Poses;
        switch (DicFormat)
        {
            case DictionaryFormat.SogouW2006:
                string[] s = Line.Split('\t', ' ');
                Word = s[0];
                Frq = s.Length == 1 ? -1 : int.Parse(s[1]);
                // The parts of speech are the third field; a line with only word and
                // frequency has none (indexing s[2] there would be out of range).
                Poses = s.Length < 3 ? null : s[2];
                break;
            case DictionaryFormat.ExcelCSV:
            default:
                int p1 = Line.IndexOf(',');
                int p2 = Line.IndexOf(',', p1 + 1);
                Word = Line.Substring(0, p1);
                Frq = int.Parse(Line.Substring(p1 + 1, p2 - p1 - 1));
                Poses = Line.Substring(p2 + 1).Trim('"').Trim();
                break;
        }

        // Look the word up once per line instead of once per check.
        string WordKey = Word.ToLower();
        WordDictionary.DicWordInfo OldInfo;
        bool WordExists = OldWords.TryGetValue(WordKey, out OldInfo);

        // No part of speech: the line is "odd" unless the word is known already.
        if (string.IsNullOrEmpty(Poses))
        {
            if (!WordExists)
            {
                Odds.Add(Line);
            }
            continue;
        }

        foreach (string InputPos in Poses.TrimEnd(',').Split(','))
        {
            if (string.IsNullOrEmpty(InputPos))
            {
                continue;
            }

            // Keep the original part-of-speech letters when the map has no entry for them.
            string PosKey = InputPos.ToLower();
            string Pos;
            if (!PosTrans.TryGetValue(PosKey, out Pos))
            {
                Pos = PosKey;
            }

            if (WordExists && OldInfo.Pos.Contains(Pos))
            {
                // Existing word with the same part of speech: record the frequency ratio.
                int SourceFrq = OldInfo.Frequence;
                double FrqR = SourceFrq == 0 ? Frq : (double)Frq / SourceFrq;
                if (FrqR > MaxFrqRate)
                {
                    MaxFrqRate = FrqR;
                }
                if (FrqR < MinFrqRate)
                {
                    MinFrqRate = FrqR;
                }
                SumFrqRate += (decimal)FrqR;
                Exists.Add(new WordDictionary.DicWordInfo(Word, Pos, Frq));
            }
            else
            {
                // New word or new part of speech.
                News.Add(new WordDictionary.DicWordInfo(Word, Pos, Frq));
            }
        }
    }

    // Average frequency conversion factor.
    AvgFrqRate = Exists.Count > 0 ? Convert.ToDouble(SumFrqRate / Exists.Count) : 0;
    OddLines = Odds.ToArray();
    NewWords = News.ToArray();
    ExistWords = Exists.ToArray();
}
/// <summary>
/// Finds the differences between a dictionary file to import and an existing dictionary
/// that is loaded from a file.
/// </summary>
/// <param name="NewDicFile">File of the dictionary to import.</param>
/// <param name="Encoding">Text encoding of the import file.</param>
/// <param name="DicFormat">Format of the import file.</param>
/// <param name="SourceDictFileName">File of the existing dictionary.</param>
/// <param name="OddLines">Receives lines that carry no part of speech and whose word is not in the existing dictionary.</param>
/// <param name="NewWords">Receives new words, or new parts of speech of existing words.</param>
/// <param name="ExistWords">Receives entries whose word and part of speech both exist already.</param>
/// <param name="MaxFrqRate">Largest frequency ratio of the existing entries.</param>
/// <param name="MinFrqRate">Smallest frequency ratio of the existing entries.</param>
/// <param name="AvgFrqRate">Average frequency ratio of the existing entries.</param>
/// <exception cref="Exception">The existing dictionary file could not be loaded.</exception>
public static void FindDifferent(string NewDicFile, Encoding Encoding, DictionaryFormat DicFormat, string SourceDictFileName, out string[] OddLines, out WordDictionary.DicWordInfo[] NewWords, out WordDictionary.DicWordInfo[] ExistWords, out double MaxFrqRate, out double MinFrqRate, out double AvgFrqRate)
{
    WordDictionary SourceDict = new WordDictionary();
    if (!SourceDict.Load(SourceDictFileName))
    {
        throw new Exception("load source dic file fail");
    }

    try
    {
        // Delegate the comparison to the overload that works on a loaded dictionary.
        FindDifferent(NewDicFile, Encoding, DicFormat, SourceDict, out OddLines, out NewWords, out ExistWords, out MaxFrqRate, out MinFrqRate, out AvgFrqRate);
    }
    finally
    {
        // Release the dictionary even when the comparison throws
        // (for example on an unreadable or malformed import file).
        SourceDict.ReleaseDict();
    }
}