Esempio n. 1
0
        /// <summary>
        /// Builds the token info dictionary, the unknown word dictionary and the
        /// connection costs from <paramref name="inputDirname"/> and writes each of
        /// them to <paramref name="outputDirname"/>, printing progress to the console.
        /// </summary>
        /// <param name="format">Dictionary format handed to the <see cref="TokenInfoDictionaryBuilder"/>.</param>
        /// <param name="inputDirname">Directory the builders read from; <c>matrix.def</c> is looked up directly inside it.</param>
        /// <param name="outputDirname">Target passed to every writer's <c>Write</c> call.</param>
        /// <param name="encoding">Encoding name handed to both dictionary builders.</param>
        /// <param name="normalizeEntry">Flag handed to the <see cref="TokenInfoDictionaryBuilder"/>.</param>
        public static void Build(DictionaryFormat format,
                                 string inputDirname,
                                 string outputDirname,
                                 string encoding,
                                 bool normalizeEntry)
        {
            // Step 1: token info dictionary.
            Console.WriteLine("building tokeninfo dict...");
            TokenInfoDictionaryWriter tokenInfoWriter =
                new TokenInfoDictionaryBuilder(format, encoding, normalizeEntry).Build(inputDirname);
            tokenInfoWriter.Write(outputDirname);
            Console.WriteLine("done");

            // Step 2: unknown word dictionary.
            Console.WriteLine("building unknown word dict...");
            UnknownDictionaryWriter unknownWordWriter =
                new UnknownDictionaryBuilder(encoding).Build(inputDirname);
            unknownWordWriter.Write(outputDirname);
            Console.WriteLine("done");

            // Step 3: connection costs, read from "matrix.def" in the input directory.
            Console.WriteLine("building connection costs...");
            string matrixDefPath = inputDirname + System.IO.Path.DirectorySeparatorChar + "matrix.def";
            ConnectionCostsWriter costsWriter = ConnectionCostsBuilder.Build(matrixDefPath);
            costsWriter.Write(outputDirname);
            Console.WriteLine("done");
        }
Esempio n. 2
0
        /// <summary>
        /// Reads an unknown-word definition file and returns a writer populated with its entries.
        /// </summary>
        /// <remarks>
        /// The writer first receives the parsed <c>NGRAM_DICTIONARY_ENTRY</c>. Every line of the
        /// file is then parsed with <c>",*,*"</c> appended, the parsed rows are sorted with
        /// <c>ComparerAnonymousHelper</c>, and the sorted rows are put into the writer in order.
        /// </remarks>
        /// <param name="filename">Path of the definition file to read.</param>
        /// <param name="encoding">Name of the text encoding used to decode the file.</param>
        /// <returns>The populated <see cref="UnknownDictionaryWriter"/>.</returns>
        public virtual UnknownDictionaryWriter ReadDictionaryFile(string filename, string encoding)
        {
            UnknownDictionaryWriter writer = new UnknownDictionaryWriter(5 * 1024 * 1024);
            List<string[]> parsedRows = new List<string[]>();
            Encoding fileEncoding = Encoding.GetEncoding(encoding);

            using (Stream input = new FileStream(filename, FileMode.Open, FileAccess.Read))
            using (TextReader lineReader = new StreamReader(input, fileEncoding))
            {
                // The n-gram entry goes in ahead of anything read from the file.
                writer.Put(CSVUtil.Parse(NGRAM_DICTIONARY_ENTRY));

                for (string row = lineReader.ReadLine(); row != null; row = lineReader.ReadLine())
                {
                    // unk.def rows carry only 10 fields; two "*" placeholders (reading and
                    // pronunciation) are appended so the writer sees the usual layout.
                    // Rows are not validated here.
                    parsedRows.Add(CSVUtil.Parse(row + ",*,*"));
                }
            }

            parsedRows.Sort(new ComparerAnonymousHelper());

            for (int i = 0; i < parsedRows.Count; i++)
            {
                writer.Put(parsedRows[i]);
            }

            return writer;
        }
Esempio n. 3
0
        /// <summary>
        /// Builds the unknown-word dictionary from the <c>unk.def</c> and <c>char.def</c>
        /// files located directly inside <paramref name="dirname"/>.
        /// </summary>
        /// <param name="dirname">Directory containing <c>unk.def</c> and <c>char.def</c>.</param>
        /// <returns>The writer produced from <c>unk.def</c> after the character definitions were applied to it.</returns>
        public virtual UnknownDictionaryWriter Build(string dirname)
        {
            string unkDefPath  = dirname + System.IO.Path.DirectorySeparatorChar + "unk.def";
            string charDefPath = dirname + System.IO.Path.DirectorySeparatorChar + "char.def";

            // Only a single unk.def file is expected.
            UnknownDictionaryWriter writer = ReadDictionaryFile(unkDefPath);
            ReadCharacterDefinition(charDefPath, writer);

            return writer;
        }
        /// <summary>
        /// Checks that <c>Put</c> rejects a malformed entry and accepts well-formed
        /// entries once their character categories have been registered.
        /// </summary>
        public void TestPut()
        {
            UnknownDictionaryWriter unkDic = new UnknownDictionaryWriter(10 * 1024 * 1024);

            // This entry has one numeric field fewer than the valid entries below and
            // is expected to be rejected. The outcome is recorded in a flag and fail()
            // is called outside the try block: with fail() inside it, the broad catch
            // would swallow the failure and the check could never fail.
            bool rejected = false;
            try
            {
                unkDic.Put(CSVUtil.Parse("KANJI,1285,11426,名詞,一般,*,*,*,*,*,*,*"));
            }
            catch (Exception)
            {
                rejected = true;
            }

            if (!rejected)
            {
                fail();
            }

            string entry1 = "ALPHA,1285,1285,13398,名詞,一般,*,*,*,*,*,*,*";
            string entry2 = "HIRAGANA,1285,1285,13069,名詞,一般,*,*,*,*,*,*,*";
            string entry3 = "KANJI,1285,1285,11426,名詞,一般,*,*,*,*,*,*,*";

            unkDic.PutCharacterCategory(0, "ALPHA");
            unkDic.PutCharacterCategory(1, "HIRAGANA");
            unkDic.PutCharacterCategory(2, "KANJI");

            // Well-formed entries must be accepted without throwing.
            unkDic.Put(CSVUtil.Parse(entry1));
            unkDic.Put(CSVUtil.Parse(entry2));
            unkDic.Put(CSVUtil.Parse(entry3));
        }
        /// <summary>
        /// Checks that <c>PutCharacterCategory</c> rejects an unknown category name and
        /// a negative code point, and accepts valid code point / category pairs.
        /// </summary>
        public void TestPutCharacterCategory()
        {
            UnknownDictionaryWriter unkDic = new UnknownDictionaryWriter(10 * 1024 * 1024);

            // Each negative case records its outcome in a flag and calls fail() outside
            // the try block: with fail() inside it, the broad catch would swallow the
            // failure and the check could never fail.
            bool unknownNameRejected = false;
            try
            {
                unkDic.PutCharacterCategory(0, "DUMMY_NAME");
            }
            catch (Exception)
            {
                unknownNameRejected = true;
            }

            if (!unknownNameRejected)
            {
                fail();
            }

            bool negativeCodePointRejected = false;
            try
            {
                unkDic.PutCharacterCategory(-1, "KATAKANA");
            }
            catch (Exception)
            {
                negativeCodePointRejected = true;
            }

            if (!negativeCodePointRejected)
            {
                fail();
            }

            // Valid pairs must be accepted without throwing.
            unkDic.PutCharacterCategory(0, "DEFAULT");
            unkDic.PutCharacterCategory(1, "GREEK");
            unkDic.PutCharacterCategory(2, "HIRAGANA");
            unkDic.PutCharacterCategory(3, "KATAKANA");
            unkDic.PutCharacterCategory(4, "KANJI");
        }
Esempio n. 6
0
        /// <summary>
        /// Parses a character definition file and registers its contents with
        /// <paramref name="dictionary"/>.
        /// </summary>
        /// <remarks>
        /// The file is decoded with the encoding named by the <c>encoding</c> member.
        /// For each line, everything from the first <c>#</c> onwards is dropped, whitespace
        /// runs are collapsed to a single space, and surrounding spaces are trimmed; lines
        /// that end up empty are skipped. Lines starting with <c>0x</c> map a single
        /// hexadecimal code point (<c>0xNNNN NAME</c>) or an inclusive range
        /// (<c>0xNNNN..0xMMMM NAME</c>) to a category via <c>PutCharacterCategory</c>.
        /// Every other line is read as <c>name invoke group length</c> and passed to
        /// <c>PutInvokeDefinition</c>.
        /// </remarks>
        /// <param name="filename">Path of the character definition file.</param>
        /// <param name="dictionary">Writer that receives the category mappings and invoke definitions.</param>
        public virtual void ReadCharacterDefinition(string filename, UnknownDictionaryWriter dictionary)
        {
            using (Stream inputStream = new FileStream(filename, FileMode.Open, FileAccess.Read))
            using (TextReader reader = new StreamReader(inputStream, Encoding.GetEncoding(encoding)))
            {
                string line;

                while ((line = reader.ReadLine()) != null)
                {
                    // Strip the comment, collapse whitespace runs, then trim both ends.
                    // Trimming (rather than removing a single leading whitespace character)
                    // ensures whitespace-only and indented lines are normalized fully
                    // instead of being misparsed as definitions.
                    line = Regex.Replace(line, "\\s*#.*", "");
                    line = Regex.Replace(line, "\\s+", " ").Trim(' ');

                    // Skip empty line or comment line
                    if (line.Length == 0)
                    {
                        continue;
                    }

                    if (line.StartsWith("0x", StringComparison.Ordinal))
                    {
                        // Category mapping: split on the first space only, so the
                        // remainder of the line is the category name.
                        string[] values = line.Split(new[] { ' ' }, 2);

                        if (!values[0].Contains(".."))
                        {
                            // Single code point; Convert.ToInt32 accepts the "0x" prefix for base 16.
                            int cp = Convert.ToInt32(values[0], 16);
                            dictionary.PutCharacterCategory(cp, values[1]);
                        }
                        else
                        {
                            // Inclusive code point range "from..to".
                            string[] codePoints = Regex.Split(values[0], "\\.\\.").TrimEnd();
                            int cpFrom = Convert.ToInt32(codePoints[0], 16);
                            int cpTo = Convert.ToInt32(codePoints[1], 16);

                            for (int i = cpFrom; i <= cpTo; i++)
                            {
                                dictionary.PutCharacterCategory(i, values[1]);
                            }
                        }
                    }
                    else
                    {
                        // Invoke definition: consecutive spaces were merged above.
                        string[] values = line.Split(' ').TrimEnd();
                        string characterClassName = values[0];
                        int invoke = int.Parse(values[1], CultureInfo.InvariantCulture);
                        int group = int.Parse(values[2], CultureInfo.InvariantCulture);
                        int length = int.Parse(values[3], CultureInfo.InvariantCulture);
                        dictionary.PutInvokeDefinition(characterClassName, invoke, group, length);
                    }
                }
            }
        }