/// <summary>
/// Writes the part-of-speech dictionary held in <c>posDict</c> to <paramref name="filename"/>,
/// creating the parent directory first when the path contains one.
/// </summary>
/// <remarks>
/// Output order: codec header, the number of <c>posDict</c> slots as a VInt32, then one record
/// per slot. A null slot is written as three zero bytes; any other slot is parsed as CSV and
/// its first three fields are written as strings.
/// </remarks>
/// <param name="filename">Path of the file to create or overwrite.</param>
protected virtual void WritePosDict(string filename)
 {
     // A bare file name has no directory component: GetDirectoryName then yields an empty
     // string (or null for a root path), and Directory.CreateDirectory rejects both.
     string directory = System.IO.Path.GetDirectoryName(filename);
     if (!string.IsNullOrEmpty(directory))
     {
         System.IO.Directory.CreateDirectory(directory);
     }

     using (Stream os = new FileStream(filename, FileMode.Create, FileAccess.Write))
     {
         DataOutput @out = new OutputStreamDataOutput(os);
         CodecUtil.WriteHeader(@out, BinaryDictionary.POSDICT_HEADER, BinaryDictionary.VERSION);
         @out.WriteVInt32(posDict.Count);
         foreach (string s in posDict)
         {
             if (s == null)
             {
                 // Unused slot: emit three zero bytes in place of the three strings.
                 @out.WriteByte((byte)0);
                 @out.WriteByte((byte)0);
                 @out.WriteByte((byte)0);
                 continue;
             }

             string[] data = CSVUtil.Parse(s);
             if (Debugging.AssertsEnabled)
             {
                 Debugging.Assert(data.Length == 3, () => "malformed pos/inflection: " + s);
             }
             @out.WriteString(data[0]);
             @out.WriteString(data[1]);
             @out.WriteString(data[2]);
         }
     }
 }
Beispiel #2
0
        /// <summary>
        /// Reads an unknown-word definition file and loads its entries into a new
        /// <c>UnknownDictionaryWriter</c>.
        /// </summary>
        /// <remarks>
        /// The n-gram entry (<c>NGRAM_DICTIONARY_ENTRY</c>) is put first. Every line of the file
        /// is then parsed with <c>",*,*"</c> appended, the parsed rows are sorted with
        /// <c>ComparerAnonymousHelper</c>, and finally put into the writer in sorted order.
        /// </remarks>
        /// <param name="filename">Path of the definition file to read.</param>
        /// <param name="encoding">Name of the text encoding used to decode the file.</param>
        /// <returns>The populated dictionary writer.</returns>
        public virtual UnknownDictionaryWriter ReadDictionaryFile(string filename, string encoding)
        {
            UnknownDictionaryWriter writer = new UnknownDictionaryWriter(5 * 1024 * 1024);
            List<string[]> rows = new List<string[]>();
            Encoding charset = Encoding.GetEncoding(encoding);

            using (Stream fileStream = new FileStream(filename, FileMode.Open, FileAccess.Read))
            using (TextReader input = new StreamReader(fileStream, charset))
            {
                writer.Put(CSVUtil.Parse(NGRAM_DICTIONARY_ENTRY));

                for (string raw = input.ReadLine(); raw != null; raw = input.ReadLine())
                {
                    // note: unk.def only has 10 fields, it simplifies the writer to just append empty reading and pronunciation,
                    // even though the unknown dictionary returns hardcoded null here.
                    // The entry is probably not worth validating here.
                    rows.Add(CSVUtil.Parse(raw + ",*,*"));
                }
            }

            rows.Sort(new ComparerAnonymousHelper());

            for (int i = 0; i < rows.Count; i++)
            {
                writer.Put(rows[i]);
            }

            return writer;
        }
        /// <summary>
        /// Checks that <c>Put</c> throws for an entry that is one field short, and that
        /// well-formed entries are accepted after their character categories are registered.
        /// </summary>
        public void TestPut()
        {
            UnknownDictionaryWriter unkDic = new UnknownDictionaryWriter(10 * 1024 * 1024);

            // This entry has 12 fields instead of the 13 used by the valid entries below.
            // The failure is reported outside the try block: if fail() signals failure by
            // throwing an Exception-derived type (as NUnit-style asserts do), calling it
            // inside the try would let the catch swallow it and the check could never fail.
            bool rejected = false;
            try
            {
                unkDic.Put(CSVUtil.Parse("KANJI,1285,11426,名詞,一般,*,*,*,*,*,*,*"));
            }
            catch (Exception)
            {
                rejected = true;
            }

            if (!rejected)
            {
                fail();
            }

            String entry1 = "ALPHA,1285,1285,13398,名詞,一般,*,*,*,*,*,*,*";
            String entry2 = "HIRAGANA,1285,1285,13069,名詞,一般,*,*,*,*,*,*,*";
            String entry3 = "KANJI,1285,1285,11426,名詞,一般,*,*,*,*,*,*,*";

            unkDic.PutCharacterCategory(0, "ALPHA");
            unkDic.PutCharacterCategory(1, "HIRAGANA");
            unkDic.PutCharacterCategory(2, "KANJI");

            unkDic.Put(CSVUtil.Parse(entry1));
            unkDic.Put(CSVUtil.Parse(entry2));
            unkDic.Put(CSVUtil.Parse(entry3));
        }
        /// <summary>
        /// Parses one dictionary entry, records its part-of-speech data in <c>posDict</c>
        /// at index <c>leftId</c>, and appends the entry's binary record to <c>m_buffer</c>.
        /// </summary>
        /// <remarks>
        /// Fields read from <paramref name="entry"/>: [0] term text, [1] left id, [2] right id,
        /// [3] word cost, [4]-[7] part-of-speech parts, [8] and [9] two further POS fields
        /// (presumably inflection data; confirm against the dictionary format),
        /// [10] base form, [11] reading, [12] pronunciation.
        /// <para/>
        /// Record layout written to the buffer: an Int16 holding <c>leftId &lt;&lt; 3 | flags</c>,
        /// an Int16 word cost, then the base form, reading and pronunciation, each present only
        /// when its flag is set.
        /// </remarks>
        /// <param name="entry">Parsed CSV fields of the entry; indexes 0 through 12 are accessed.</param>
        /// <returns>Current position of buffer, which will be wordId of next entry.</returns>
        public virtual int Put(string[] entry)
        {
            // Numeric fields are parsed culture-independently.
            short leftId   = short.Parse(entry[1], CultureInfo.InvariantCulture);
            short rightId  = short.Parse(entry[2], CultureInfo.InvariantCulture);
            short wordCost = short.Parse(entry[3], CultureInfo.InvariantCulture);

            StringBuilder sb = new StringBuilder();

            // build up the POS string: fields 4..7 joined with '-', skipping "*" placeholders
            for (int i = 4; i < 8; i++)
            {
                string part = entry[i];
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(part.Length > 0);
                }
                if (!"*".Equals(part, StringComparison.Ordinal))
                {
                    if (sb.Length > 0)
                    {
                        sb.Append('-');
                    }
                    sb.Append(part);
                }
            }

            string posData = sb.ToString();

            // Reuse the builder to assemble the three-field CSV text stored in posDict:
            // "<pos>,<entry[8]>,<entry[9]>", where a "*" field is left empty.
            sb.Length = 0;
            sb.Append(CSVUtil.QuoteEscape(posData));
            sb.Append(',');
            if (!"*".Equals(entry[8], StringComparison.Ordinal))
            {
                sb.Append(CSVUtil.QuoteEscape(entry[8]));
            }
            sb.Append(',');
            if (!"*".Equals(entry[9], StringComparison.Ordinal))
            {
                sb.Append(CSVUtil.QuoteEscape(entry[9]));
            }
            string fullPOSData = sb.ToString();

            string baseForm      = entry[10];
            string reading       = entry[11];
            string pronunciation = entry[12];

            // extend buffer if necessary
            int left = m_buffer.Remaining;
            // worst case: two short, 3 bytes, and features (all as utf-16)
            int worstCase = 4 + 3 + 2 * (baseForm.Length + reading.Length + pronunciation.Length);

            if (worstCase > left)
            {
                // Allocate a larger buffer and copy over what has been written so far;
                // Flip() is called on the old buffer before it is passed to Put as the copy source.
                ByteBuffer newBuffer = ByteBuffer.Allocate(ArrayUtil.Oversize(m_buffer.Limit + worstCase - left, 1));
                m_buffer.Flip();
                newBuffer.Put(m_buffer);
                m_buffer = newBuffer;
            }

            // Each flag marks an optional field that is written to the record below.
            int flags = 0;

            // Base form is stored only when it is neither "*" nor identical to the term text.
            if (!("*".Equals(baseForm, StringComparison.Ordinal) || baseForm.Equals(entry[0], StringComparison.Ordinal)))
            {
                flags |= BinaryDictionary.HAS_BASEFORM;
            }
            // Reading is stored only when it differs from ToKatakana applied to the term text.
            if (!reading.Equals(ToKatakana(entry[0]), StringComparison.Ordinal))
            {
                flags |= BinaryDictionary.HAS_READING;
            }
            // Pronunciation is stored only when it differs from the reading.
            if (!pronunciation.Equals(reading, StringComparison.Ordinal))
            {
                flags |= BinaryDictionary.HAS_PRONUNCIATION;
            }

            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(leftId == rightId);
                // leftId is shifted left by 3 and stored in a 16-bit value below.
                Debugging.Assert(leftId < 4096); // there are still unused bits
            }
            // add pos mapping: pad posDict with nulls so that index leftId exists
            int toFill = 1 + leftId - posDict.Count;

            for (int i = 0; i < toFill; i++)
            {
                posDict.Add(null);
            }

            string existing = posDict[leftId];

            if (Debugging.AssertsEnabled)
            {
                // An id that is already mapped must map to the same POS data.
                Debugging.Assert(existing == null || existing.Equals(fullPOSData, StringComparison.Ordinal));
            }
            posDict[leftId] = fullPOSData;

            // Record header: id combined with the flags (assumes the flag constants fit in the
            // low 3 bits; their values are not visible here), followed by the word cost.
            m_buffer.PutInt16((short)(leftId << 3 | flags));
            m_buffer.PutInt16(wordCost);

            if ((flags & BinaryDictionary.HAS_BASEFORM) != 0)
            {
                if (Debugging.AssertsEnabled)
                {
                    // Both lengths below are packed into 4 bits each of a single byte.
                    Debugging.Assert(baseForm.Length < 16);
                }
                // One byte: high nibble = SharedPrefix(term, baseForm), low nibble = number of
                // remaining base-form chars; then those remaining chars.
                int shared = SharedPrefix(entry[0], baseForm);
                int suffix = baseForm.Length - shared;
                m_buffer.Put((byte)(shared << 4 | suffix));
                for (int i = shared; i < baseForm.Length; i++)
                {
                    m_buffer.PutChar(baseForm[i]);
                }
            }

            if ((flags & BinaryDictionary.HAS_READING) != 0)
            {
                // Length byte is (length << 1); the low bit is set when IsKatakana(reading) is
                // true, in which case WriteKatakana is used instead of writing raw chars.
                if (IsKatakana(reading))
                {
                    m_buffer.Put((byte)(reading.Length << 1 | 1));
                    WriteKatakana(reading);
                }
                else
                {
                    m_buffer.Put((byte)(reading.Length << 1));
                    for (int i = 0; i < reading.Length; i++)
                    {
                        m_buffer.PutChar(reading[i]);
                    }
                }
            }

            if ((flags & BinaryDictionary.HAS_PRONUNCIATION) != 0)
            {
                // we can save 150KB here, but it makes the reader a little complicated.
                // int shared = sharedPrefix(reading, pronunciation);
                // buffer.put((byte) shared);
                // pronunciation = pronunciation.substring(shared);

                // Same encoding as the reading above: length byte with a katakana bit, then data.
                if (IsKatakana(pronunciation))
                {
                    m_buffer.Put((byte)(pronunciation.Length << 1 | 1));
                    WriteKatakana(pronunciation);
                }
                else
                {
                    m_buffer.Put((byte)(pronunciation.Length << 1));
                    for (int i = 0; i < pronunciation.Length; i++)
                    {
                        m_buffer.PutChar(pronunciation[i]);
                    }
                }
            }

            return(m_buffer.Position);
        }
Beispiel #5
0
        /// <summary>
        /// Builds a token-info dictionary from the given CSV files: parses every line, sorts the
        /// entries by term, writes them into a <c>TokenInfoDictionaryWriter</c>, and builds the
        /// FST that maps each distinct term to an ordinal.
        /// </summary>
        /// <remarks>
        /// Uses the <c>encoding</c>, <c>normalizeEntries</c> and <c>offset</c> members, which are
        /// declared outside this method. Lines with fewer than 13 fields are reported on the
        /// console and skipped. When <c>normalizeEntries</c> is set, an entry whose term is not
        /// NFKC-normalized is added a second time with every field NFKC-normalized.
        /// </remarks>
        /// <param name="csvFiles">Paths of the CSV files to read, in the order they are processed.</param>
        /// <returns>The populated dictionary writer with its FST set.</returns>
        public virtual TokenInfoDictionaryWriter BuildDictionary(IList<string> csvFiles)
        {
            TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);

            // all lines in the file
            Console.WriteLine("  parse...");
            List<string[]> lines = new List<string[]>(400000);

            foreach (string file in csvFiles)
            {
                using (Stream inputStream = new FileStream(file, FileMode.Open, FileAccess.Read))
                using (TextReader reader = new StreamReader(inputStream, Encoding.GetEncoding(encoding)))
                {
                    string line = null;
                    while ((line = reader.ReadLine()) != null)
                    {
                        string[] entry = CSVUtil.Parse(line);

                        if (entry.Length < 13)
                        {
                            Console.WriteLine("Entry in CSV is not valid: " + line);
                            continue;
                        }

                        string[] formatted = FormatEntry(entry);
                        lines.Add(formatted);

                        // NFKC normalize dictionary entry
                        if (normalizeEntries)
                        {
                            if (entry[0].IsNormalized(NormalizationForm.FormKC))
                            {
                                continue;
                            }
                            string[] normalizedEntry = new string[entry.Length];
                            for (int i = 0; i < entry.Length; i++)
                            {
                                normalizedEntry[i] = entry[i].Normalize(NormalizationForm.FormKC);
                            }

                            formatted = FormatEntry(normalizedEntry);
                            lines.Add(formatted);
                        }
                    }
                }
            }

            Console.WriteLine("  sort...");

            // sort by term: we sorted the files already and use a stable sort.
            // List<T>.Sort is not stable, so entries that compare equal could be reordered;
            // Enumerable.OrderBy is stable and keeps them in the order they were read.
            lines = System.Linq.Enumerable.ToList(
                System.Linq.Enumerable.OrderBy(lines, row => row, new ComparerAnonymousHelper()));

            Console.WriteLine("  encode...");

            PositiveInt32Outputs fstOutput = PositiveInt32Outputs.Singleton;
            Builder<long?> fstBuilder = new Builder<long?>(Lucene.Net.Util.Fst.FST.INPUT_TYPE.BYTE2, 0, 0, true, true, int.MaxValue, fstOutput, null, true, PackedInt32s.DEFAULT, true, 15);
            Int32sRef scratch = new Int32sRef();
            long ord = -1; // first ord will be 0
            string lastValue = null;

            // build tokeninfo dictionary
            foreach (string[] entry in lines)
            {
                int next = dictionary.Put(entry);

                // Put returning the unchanged offset means nothing was written for this entry.
                if (next == offset)
                {
                    Console.WriteLine("Failed to process line: " + Collections.ToString(entry));
                    continue;
                }

                string token = entry[0];
                if (!token.Equals(lastValue, StringComparison.Ordinal))
                {
                    // new word to add to fst
                    ord++;
                    lastValue = token;
                    scratch.Grow(token.Length);
                    scratch.Length = token.Length;
                    for (int i = 0; i < token.Length; i++)
                    {
                        scratch.Int32s[i] = (int)token[i];
                    }
                    fstBuilder.Add(scratch, ord);
                }

                // Map the term's ordinal to the offset at which this entry was written.
                dictionary.AddMapping((int)ord, offset);
                offset = next;
            }

            FST<long?> fst = fstBuilder.Finish();

            Console.WriteLine("  " + fst.NodeCount + " nodes, " + fst.ArcCount + " arcs, " + fst.GetSizeInBytes() + " bytes...  ");
            dictionary.SetFST(fst);
            Console.WriteLine(" done");

            return dictionary;
        }