示例#1
0
        // Builds the three dictionary artifacts (token info, unknown words, connection costs)
        // from the sources in inputDirname and writes each one into outputDirname,
        // reporting progress on the console before and after every stage.
        public static void Build(DictionaryFormat format,
                                 string inputDirname,
                                 string outputDirname,
                                 string encoding,
                                 bool normalizeEntry)
        {
            // Stage 1: token info dictionary.
            Console.WriteLine("building tokeninfo dict...");
            var tokenBuilder = new TokenInfoDictionaryBuilder(format, encoding, normalizeEntry);
            TokenInfoDictionaryWriter tokenWriter = tokenBuilder.Build(inputDirname);
            tokenWriter.Write(outputDirname);
            Console.WriteLine("done");

            // Stage 2: unknown word dictionary.
            Console.WriteLine("building unknown word dict...");
            var unknownBuilder = new UnknownDictionaryBuilder(encoding);
            UnknownDictionaryWriter unknownWriter = unknownBuilder.Build(inputDirname);
            unknownWriter.Write(outputDirname);
            Console.WriteLine("done");

            // Stage 3: connection costs, read from "matrix.def" inside the input directory.
            Console.WriteLine("building connection costs...");
            string matrixPath = inputDirname + System.IO.Path.DirectorySeparatorChar + "matrix.def";
            ConnectionCostsWriter costsWriter = ConnectionCostsBuilder.Build(matrixPath);
            costsWriter.Write(outputDirname);
            Console.WriteLine("done");
        }
示例#2
0
        // A custom key/value pair separator must be exposed by the format and
        // placed between the entries of the rendered dictionary.
        public void ShouldSetKeyValuePairSeparatorWithinOutput()
        {
            const string expectedOutput = "{0:'null'|1:'Leroy Jenkins'|7:'James Bond'}";

            var format = DictionaryFormat.CreateDefault().SetKeyValuePairSeparator("|");
            var render = format.Compile<int, string>();

            Assert.That(format.KeyValuePairSeparator, Is.EqualTo("|"));
            Assert.That(render(Dictionary), Is.EqualTo(expectedOutput));
        }
示例#3
0
        // A prefix/suffix configured for whole key/value pairs must be exposed by
        // the format and wrapped around every rendered entry.
        public void ShouldSetKeyValuePairPrefixAndSuffixWithinOutput()
        {
            const string expectedOutput = "{<0:'null'>, <1:'Leroy Jenkins'>, <7:'James Bond'>}";

            var format = DictionaryFormat.CreateDefault().SetKeyValuePrefixAndSuffix("<", ">");
            var render = format.Compile<int, string>();

            Assert.That(format.KeyValuePairPrefixAndSuffix, Is.EqualTo(("<", ">")));
            Assert.That(render(Dictionary), Is.EqualTo(expectedOutput));
        }
示例#4
0
        // A prefix/suffix configured for keys must be exposed by the format and
        // wrapped around every rendered key.
        public void ShouldSetKeyPrefixAndSuffixWithinOutput()
        {
            const string expectedOutput = "{[0]:'null', [1]:'Leroy Jenkins', [7]:'James Bond'}";

            var format = DictionaryFormat.CreateDefault().SetKeyPrefixAndSuffix("[", "]");
            var render = format.Compile<int, string>();

            Assert.That(format.KeyPrefixAndSuffix, Is.EqualTo(("[", "]")));
            Assert.That(render(Dictionary), Is.EqualTo(expectedOutput));
        }
示例#5
0
        // A prefix/suffix configured for values must be exposed by the format and
        // wrapped around every rendered value (replacing the default quotes).
        public void ShouldSetValuePrefixAndSuffixWithinOutput()
        {
            const string expectedOutput = "{0:(null), 1:(Leroy Jenkins), 7:(James Bond)}";

            var format = DictionaryFormat.CreateDefault().SetValuePrefixAndSuffix("(", ")");
            var render = format.Compile<int, string>();

            Assert.That(format.ValuePrefixAndSuffix, Is.EqualTo(("(", ")")));
            Assert.That(render(Dictionary), Is.EqualTo(expectedOutput));
        }
示例#6
0
        // A prefix/suffix configured for the dictionary as a whole must be exposed
        // by the format and used as the outer delimiters of the rendered text.
        public void ShouldIncludeDictionaryPrefixAndSuffixIntoOutput()
        {
            const string expectedOutput = "<0:'null', 1:'Leroy Jenkins', 7:'James Bond'>";

            var format = DictionaryFormat.CreateDefault().SetDictionaryPrefixAndSuffix("<", ">");
            var render = format.Compile<int, string>();

            Assert.That(format.DictionaryPrefixAndSuffix, Is.EqualTo(("<", ">")));
            Assert.That(render(Dictionary), Is.EqualTo(expectedOutput));
        }
示例#7
0
        // Adding an option and then removing it must leave the format with no
        // options set and with the default single-line output.
        public void ShouldRemovePreviouslyAddedOption()
        {
            const string expectedOutput = "{0:'null', 1:'Leroy Jenkins', 7:'James Bond'}";

            var format = DictionaryFormat.CreateDefault()
                                         .AddOptions(DictionaryFormatOptions.ItemPerLine)
                                         .RemoveOptions(DictionaryFormatOptions.ItemPerLine);
            var render = format.Compile<int, string>();

            Assert.That(format.Options, Is.EqualTo(DictionaryFormatOptions.None));
            Assert.That(render(Dictionary), Is.EqualTo(expectedOutput));
        }
示例#8
0
        // The function produced by the non-generic Compile() must also render a
        // generic Dictionary<int, string> using the default format.
        public void ShouldApplyNonGenericFunctionToGenericDictionary()
        {
            var render = DictionaryFormat.CreateDefault().Compile();

            var people = new Dictionary<int, string>
            {
                { 7, "James Bond" },
                { 8, "Bill Timothy" },
                { 12, "Sam Johnston" }
            };

            Assert.That(
                render(people),
                Is.EqualTo("{7:'James Bond', 8:'Bill Timothy', 12:'Sam Johnston'}"));
        }
示例#9
0
        // The function produced by the non-generic Compile() must render a legacy
        // non-generic Hashtable using the default format.
        public void ShouldApplyFormatToOldDictionary()
        {
            var render = DictionaryFormat.CreateDefault().Compile();

            var people = new Hashtable
            {
                { 7, "James Bond" },
                { 8, "Bill Timothy" },
                { 12, "Sam Johnston" }
            };

            // NOTE(review): the expected entry order (12, 8, 7) mirrors the order in which
            // Hashtable happens to enumerate these keys; presumably stable for this input,
            // but it is not something the test controls - confirm if this ever flakes.
            Assert.That(
                render(people),
                Is.EqualTo("{12:'Sam Johnston', 8:'Bill Timothy', 7:'James Bond'}"));
        }
示例#10
0
        // CreateDefault() must yield ":" and ", " separators, quoted values, braces
        // around the dictionary, no key or pair affixes, and no options.
        public void ShouldCreateDefaultFormat()
        {
            (string, string) noAffixes = (null, null);

            var format = DictionaryFormat.CreateDefault();
            var render = format.Compile<int, string>();

            // Separators.
            Assert.That(format.KeyValueSeparator, Is.EqualTo(":"));
            Assert.That(format.KeyValuePairSeparator, Is.EqualTo(", "));

            // Prefix/suffix pairs.
            Assert.That(format.KeyPrefixAndSuffix, Is.EqualTo(noAffixes));
            Assert.That(format.ValuePrefixAndSuffix, Is.EqualTo(("'", "'")));
            Assert.That(format.KeyValuePairPrefixAndSuffix, Is.EqualTo(noAffixes));
            Assert.That(format.DictionaryPrefixAndSuffix, Is.EqualTo(("{", "}")));

            // Options and resulting output.
            Assert.That(format.Options, Is.EqualTo(DictionaryFormatOptions.None));
            Assert.That(render(Dictionary), Is.EqualTo("{0:'null', 1:'Leroy Jenkins', 7:'James Bond'}"));
        }
示例#11
0
        // A format created through the parameterless constructor must have no
        // separators, no affixes and no options, so keys and values are simply
        // concatenated in the output.
        public void ShouldInitializeEmptyFormat()
        {
            (string, string) noAffixes = (null, null);

            var format = new DictionaryFormat();
            var render = format.Compile<int, string>();

            // Separators.
            Assert.That(format.KeyValueSeparator, Is.Null);
            Assert.That(format.KeyValuePairSeparator, Is.Null);

            // Prefix/suffix pairs.
            Assert.That(format.KeyPrefixAndSuffix, Is.EqualTo(noAffixes));
            Assert.That(format.ValuePrefixAndSuffix, Is.EqualTo(noAffixes));
            Assert.That(format.KeyValuePairPrefixAndSuffix, Is.EqualTo(noAffixes));
            Assert.That(format.DictionaryPrefixAndSuffix, Is.EqualTo(noAffixes));

            // Options and resulting output.
            Assert.That(format.Options, Is.EqualTo(DictionaryFormatOptions.None));
            Assert.That(render(Dictionary), Is.EqualTo("0null1Leroy Jenkins7James Bond"));
        }
示例#12
0
        // With ItemPerLine set and the pair separator cleared, the opening brace and
        // every entry must each end with a line break, followed by the closing brace.
        public void ShouldProduceNewLineSeparatedOutputWithLineBreak()
        {
            var format = DictionaryFormat.CreateDefault()
                                         .SetKeyValuePairSeparator(null)
                                         .AddOptions(DictionaryFormatOptions.ItemPerLine);
            var render = format.Compile<int, string>();

            // Build the expected text with AppendLine so the platform line break is used.
            var expected = new StringBuilder();
            expected.AppendLine("{");
            expected.AppendLine("0:'null'");
            expected.AppendLine("1:'Leroy Jenkins'");
            expected.AppendLine("7:'James Bond'");
            expected.Append("}");

            Assert.That(format.Options, Is.EqualTo(DictionaryFormatOptions.ItemPerLine));
            Assert.That(render(Dictionary), Is.EqualTo(expected.ToString()));
        }
示例#13
0
        // Customising one format obtained from CreateDefault() must not leak into a
        // later CreateDefault() call: the second format and its compiled function are
        // distinct instances and still produce the default output.
        public void ShouldAlwaysCreateNewInstanceOfDefaultFormat()
        {
            const string defaultOutput = "{0:'null', 1:'Leroy Jenkins', 7:'James Bond'}";

            var customised = DictionaryFormat.CreateDefault()
                                             .AddOptions(DictionaryFormatOptions.ItemPerLine)
                                             .SetKeyPrefixAndSuffix("[", "]")
                                             .SetKeyValuePrefixAndSuffix("<", ">");
            var customisedRender = customised.Compile<int, string>();

            var pristine = DictionaryFormat.CreateDefault();
            var pristineRender = pristine.Compile<int, string>();

            Assert.That(pristine, Is.Not.SameAs(customised));
            Assert.That(pristineRender, Is.Not.SameAs(customisedRender));
            Assert.That(pristineRender(Dictionary), Is.EqualTo(defaultOutput));
        }
示例#14
0
 /// <summary>
 /// Registers a <see cref="DictionaryFormat"/> to be used when converting members of
 /// <see cref="IDictionary"/> type to string. Only members whose declared types are assignable to
 /// <see cref="IDictionary"/> are affected.
 /// </summary>
 /// <param name="setup">
 /// Optional function that receives the default <see cref="DictionaryFormat"/> and returns either a
 /// modified version of it or a brand new instance to be used by the <see cref="object.ToString"/>
 /// function being built. When it returns null, the default format passed to it is used instead.
 /// When the function is omitted (or null is passed), <see cref="DictionaryFormat.CreateDefault"/>
 /// is used as is.
 /// </param>
 /// <returns>Updated <see cref="ToStringBuilder{TTarget}"/> instance.</returns>
 public ToStringBuilder<TTarget> UseDictionaryFormat(
     Func<DictionaryFormat, DictionaryFormat> setup = null)
 {
     // Start from a fresh default format and let the caller customise it.
     var chosenFormat = SetupFormat(DictionaryFormat.CreateDefault(), setup);

     // Compile the format into a conversion function and register it.
     var converter = chosenFormat.Compile();
     return Use(converter);
 }
示例#15
0
 /// <summary>
 /// Returns the part-of-speech conversion map for the given external dictionary format.
 /// </summary>
 /// <param name="DicFormat">Format of the external dictionary.</param>
 /// <returns>
 /// A map from lower-cased source tag to lower-cased target tag, parsed from a whitespace-separated
 /// list of "source-target" entries. The map is empty when no conversion table is defined for
 /// <paramref name="DicFormat"/>.
 /// </returns>
 private static Dictionary<string, string> getPosTransformMap(DictionaryFormat DicFormat)
 {
     Dictionary<string, string> PosTrans = new Dictionary<string, string>(50);
     string PosTransString = null;
     switch (DicFormat)
     {
         case DictionaryFormat.SogouW2006:
         case DictionaryFormat.ExcelCSV:
             PosTransString = SogouW2006PosTrans;
             break;
     }

     // Formats without a conversion table previously caused a NullReferenceException on Split();
     // an empty map lets callers fall back to the original tags.
     if (string.IsNullOrEmpty(PosTransString))
         return PosTrans;

     // A null separator array splits on any whitespace; RemoveEmptyEntries keeps runs of
     // whitespace (or leading/trailing whitespace) from producing empty entries that have no '-'.
     foreach (string PosT in PosTransString.Split((char[])null, StringSplitOptions.RemoveEmptyEntries))
     {
         string[] s = PosT.Split('-');
         PosTrans.Add(s[0].ToLower(), s[1].ToLower());
     }
     return PosTrans;
 }
示例#16
0
        /// <summary>
        /// Imports an external word list into a dct dictionary. Word frequencies are scaled either by a
        /// fixed ratio or by the average frequency ratio measured on the words both dictionaries share.
        /// </summary>
        /// <param name="ImportDicFile">File name of the external word list.</param>
        /// <param name="ImportEncoding">Text encoding of the external word list.</param>
        /// <param name="SourceDicFile">File name of the source dct file.</param>
        /// <param name="DestDicFile">File name of the destination dct file.</param>
        /// <param name="DicFormat">Format of the external word list.</param>
        /// <param name="OddLines">Lines of the import file that are invalid and whose word is not in the source dictionary.</param>
        /// <param name="ImportFrqRate">
        /// Fixed frequency ratio: imported frequencies are divided by this number before being stored.
        /// When it is less than or equal to 0, <paramref name="AvgFrqRate"/> is used instead.
        /// </param>
        /// <param name="AvgFrqRate">Average frequency ratio of the import file relative to the source dictionary.</param>
        /// <returns>The number of imported entries.</returns>
        /// <exception cref="Exception">The source dictionary file could not be loaded.</exception>
        public static int ImportDictionary(string ImportDicFile, Encoding ImportEncoding, string SourceDicFile, string DestDicFile, DictionaryFormat DicFormat, out string[] OddLines, out double AvgFrqRate, double ImportFrqRate = 0)
        {
            // Compare the import file against the source dictionary.
            double MaxFrqRate, MinFrqRate;
            WordDictionary.DicWordInfo[] NewWords;
            WordDictionary.DicWordInfo[] ExistWords;
            FindDifferent(ImportDicFile, ImportEncoding, DicFormat, SourceDicFile, out OddLines, out NewWords, out ExistWords, out MaxFrqRate, out MinFrqRate, out AvgFrqRate);

            // Pick the divisor. AvgFrqRate is 0 when the two dictionaries share no words; dividing by
            // it would produce Infinity/NaN and make Convert.ToInt32 throw, so fall back to 1
            // (frequencies are imported unchanged). The negated comparison also catches NaN.
            double FrqDivisor = ImportFrqRate > 0 ? ImportFrqRate : AvgFrqRate;
            if (!(FrqDivisor > 0))
                FrqDivisor = 1;

            // Load the source dictionary.
            WordDictionary dict = new WordDictionary();
            if (!dict.Load(SourceDicFile))
                throw new Exception("load source dic file fail");

            try
            {
                // Add the new words with their scaled frequencies.
                foreach (WordDictionary.DicWordInfo Word in NewWords)
                {
                    int Frq = Convert.ToInt32(Word.Frequence / FrqDivisor);
                    dict.AddWord(Word.Word, Word.Pos, Frq);
                }

                // Save the merged dictionary.
                dict.Save(DestDicFile);
            }
            finally
            {
                // Release the loaded dictionary even when adding or saving fails.
                dict.ReleaseDict();
            }
            return NewWords.Length;
        }
示例#17
0
        /// <summary>
        /// Finds the differences between an import word list and an already loaded dictionary.
        /// </summary>
        /// <param name="NewDicFile">Import word list file.</param>
        /// <param name="Encoding">Text encoding of the import file.</param>
        /// <param name="DicFormat">Format of the import file.</param>
        /// <param name="SourceDict">The existing dictionary object.</param>
        /// <param name="OddLines">Lines that carry no part-of-speech tag and whose word is not in the existing dictionary.</param>
        /// <param name="NewWords">New words, or new part-of-speech tags for existing words.</param>
        /// <param name="ExistWords">Words already present with the same part-of-speech tag.</param>
        /// <param name="MaxFrqRate">Largest import/source frequency ratio among the existing words (stays <see cref="double.MinValue"/> when there are none).</param>
        /// <param name="MinFrqRate">Smallest import/source frequency ratio among the existing words (stays <see cref="double.MaxValue"/> when there are none).</param>
        /// <param name="AvgFrqRate">Average import/source frequency ratio among the existing words (0 when there are none).</param>
        public static void FindDifferent(string NewDicFile, Encoding Encoding, DictionaryFormat DicFormat, WordDictionary SourceDict,
            out string[] OddLines, out WordDictionary.DicWordInfo[] NewWords, out WordDictionary.DicWordInfo[] ExistWords,
            out double MaxFrqRate, out double MinFrqRate, out double AvgFrqRate)
        {
            // Initialise the statistics.
            MaxFrqRate = double.MinValue; MinFrqRate = double.MaxValue; decimal SumFrqRate = 0;

            // Prepare the part-of-speech conversion map.
            Dictionary<string, string> PosTrans = getPosTransformMap(DicFormat);

            // Snapshot of the existing dictionary; looked up below by lower-cased word.
            Dictionary<string, WordDictionary.DicWordInfo> OldWords = SourceDict.ToWordDictionary();

            // Result buffers.
            List<string> Odds = new List<string>(OldWords.Count / 2);
            List<WordDictionary.DicWordInfo> Exists = new List<WordDictionary.DicWordInfo>(OldWords.Count / 2);
            List<WordDictionary.DicWordInfo> News = new List<WordDictionary.DicWordInfo>(OldWords.Count / 2);

            // Read the import file and collect frequency ratios for words that already exist,
            // so the frequency conversion ratio can be estimated.
            foreach (string Line in File.ReadAllLines(NewDicFile, Encoding))
            {
                string Word;
                int Frq;
                string Poses;

                switch (DicFormat)
                {
                    case DictionaryFormat.SogouW2006:
                        string[] s = Line.Split('\t', ' ');
                        Word = s[0];
                        Frq = s.Length == 1 ? -1 : int.Parse(s[1]);
                        // The tag field is the third one; the previous "< 2" check indexed s[2]
                        // on two-field lines and threw IndexOutOfRangeException.
                        Poses = s.Length < 3 ? null : s[2];
                        break;

                    case DictionaryFormat.ExcelCSV:
                    default:
                        int p1 = Line.IndexOf(',');
                        int p2 = p1 < 0 ? -1 : Line.IndexOf(',', p1 + 1);
                        if (p2 < 0)
                        {
                            // Fewer than three fields (including blank lines): there is no tag field,
                            // so treat the line as untagged instead of crashing in Substring.
                            Word = p1 < 0 ? Line : Line.Substring(0, p1);
                            Frq = -1;
                            Poses = null;
                        }
                        else
                        {
                            Word = Line.Substring(0, p1);
                            Frq = int.Parse(Line.Substring(p1 + 1, p2 - p1 - 1));
                            Poses = Line.Substring(p2 + 1).Trim('"').Trim();
                        }
                        break;
                }

                // Look the word up once per line.
                WordDictionary.DicWordInfo OldInfo;
                bool Known = OldWords.TryGetValue(Word.ToLower(), out OldInfo);

                if (string.IsNullOrEmpty(Poses))
                {
                    // Untagged lines are only reported when the word is unknown.
                    if (!Known) Odds.Add(Line);
                    continue;
                }

                foreach (string InputPos in Poses.TrimEnd(',').Split(','))
                {
                    if (string.IsNullOrEmpty(InputPos)) continue;

                    // Keep the original (lower-cased) tag when the conversion map has no entry for it.
                    string LowerPos = InputPos.ToLower();
                    string Pos;
                    if (!PosTrans.TryGetValue(LowerPos, out Pos)) Pos = LowerPos;

                    if (Known && OldInfo.Pos.Contains(Pos))
                    {
                        // Existing word with the same tag: record the frequency ratio.
                        // A source frequency of 0 uses the import frequency itself as the ratio.
                        int SourceFrq = OldInfo.Frequence;
                        double FrqR = SourceFrq == 0 ? Frq : (double)Frq / SourceFrq;
                        if (FrqR > MaxFrqRate) MaxFrqRate = FrqR;
                        if (FrqR < MinFrqRate) MinFrqRate = FrqR;
                        SumFrqRate += (decimal)FrqR;
                        Exists.Add(new WordDictionary.DicWordInfo(Word, Pos, Frq));
                    }
                    else
                    {
                        // New word, or new tag for an existing word.
                        News.Add(new WordDictionary.DicWordInfo(Word, Pos, Frq));
                    }
                }
            }

            // Average frequency conversion ratio.
            AvgFrqRate = Exists.Count > 0 ? Convert.ToDouble(SumFrqRate / Exists.Count) : 0;

            OddLines = Odds.ToArray();
            NewWords = News.ToArray();
            ExistWords = Exists.ToArray();
        }
示例#18
0
 /// <summary>
 /// Finds the differences between an import word list and an existing dictionary file.
 /// Loads the dictionary from <paramref name="SourceDictFileName"/>, runs the comparison against
 /// the loaded instance and releases it afterwards.
 /// </summary>
 /// <param name="NewDicFile">Import word list file.</param>
 /// <param name="Encoding">Text encoding of the import file.</param>
 /// <param name="DicFormat">Format of the import file.</param>
 /// <param name="SourceDictFileName">File name of the existing dictionary.</param>
 /// <param name="OddLines">Lines that carry no part-of-speech tag and whose word is not in the existing dictionary.</param>
 /// <param name="NewWords">New words, or new part-of-speech tags for existing words.</param>
 /// <param name="ExistWords">Words already present with the same part-of-speech tag.</param>
 /// <param name="MaxFrqRate">Largest frequency ratio among the existing words.</param>
 /// <param name="MinFrqRate">Smallest frequency ratio among the existing words.</param>
 /// <param name="AvgFrqRate">Average frequency ratio among the existing words.</param>
 /// <exception cref="Exception">The source dictionary file could not be loaded.</exception>
 public static void FindDifferent(string NewDicFile, Encoding Encoding, DictionaryFormat DicFormat, string SourceDictFileName,
     out string[] OddLines, out WordDictionary.DicWordInfo[] NewWords, out WordDictionary.DicWordInfo[] ExistWords,
     out double MaxFrqRate, out double MinFrqRate, out double AvgFrqRate)
 {
     WordDictionary SourceDict = new WordDictionary();
     if (!SourceDict.Load(SourceDictFileName))
         throw new Exception("load source dic file fail");

     try
     {
         FindDifferent(NewDicFile, Encoding, DicFormat, SourceDict, out OddLines, out NewWords, out ExistWords, out MaxFrqRate, out MinFrqRate, out AvgFrqRate);
     }
     finally
     {
         // Release the loaded dictionary even when the comparison throws
         // (for example on an unreadable or malformed import file).
         SourceDict.ReleaseDict();
     }
 }