/// <summary>
/// Writes the POS dictionary file: a codec header, the entry count, then for
/// each slot either three empty strings (for a <c>null</c> placeholder) or the
/// three parsed POS/inflection fields of the entry.
/// </summary>
/// <param name="filename">Destination file path; the parent directory is created if needed.</param>
protected virtual void WritePosDict(string filename)
{
    //new File(filename).getParentFile().mkdirs();
    // Path.GetDirectoryName returns null for a root path and "" for a bare
    // file name; Directory.CreateDirectory throws on either, so only create
    // the directory when there actually is a parent component.
    string parentDir = System.IO.Path.GetDirectoryName(filename);
    if (!string.IsNullOrEmpty(parentDir))
    {
        System.IO.Directory.CreateDirectory(parentDir);
    }
    using (Stream os = new FileStream(filename, FileMode.Create, FileAccess.Write))
    {
        DataOutput @out = new OutputStreamDataOutput(os);
        CodecUtil.WriteHeader(@out, BinaryDictionary.POSDICT_HEADER, BinaryDictionary.VERSION);
        @out.WriteVInt32(posDict.Count);
        foreach (string s in posDict)
        {
            if (s == null)
            {
                // A null slot is encoded as three zero-length strings
                // (each written as a single 0 length byte).
                @out.WriteByte((byte)0);
                @out.WriteByte((byte)0);
                @out.WriteByte((byte)0);
            }
            else
            {
                string[] data = CSVUtil.Parse(s);
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(data.Length == 3, () => "malformed pos/inflection: " + s);
                }
                @out.WriteString(data[0]);
                @out.WriteString(data[1]);
                @out.WriteString(data[2]);
            }
        }
    }
}
/// <summary>
/// Reads an unknown-word definition file (unk.def) with the given character
/// <paramref name="encoding"/> and returns a writer populated with the n-gram
/// entry plus all file entries, added in sorted order.
/// </summary>
/// <param name="filename">Path of the unknown dictionary definition file.</param>
/// <param name="encoding">Name of the character encoding used by the file.</param>
/// <returns>A populated <see cref="UnknownDictionaryWriter"/>.</returns>
public virtual UnknownDictionaryWriter ReadDictionaryFile(string filename, string encoding)
{
    UnknownDictionaryWriter result = new UnknownDictionaryWriter(5 * 1024 * 1024);
    List<string[]> parsedEntries = new List<string[]>();
    Encoding charset = Encoding.GetEncoding(encoding);
    using (Stream input = new FileStream(filename, FileMode.Open, FileAccess.Read))
    using (TextReader textReader = new StreamReader(input, charset))
    {
        result.Put(CSVUtil.Parse(NGRAM_DICTIONARY_ENTRY));
        string currentLine;
        while ((currentLine = textReader.ReadLine()) != null)
        {
            // note: unk.def only has 10 fields, it simplifies the writer to just
            // append empty reading and pronunciation, even though the unknown
            // dictionary returns hardcoded null here.
            // Probably we don't need to validate entry
            string[] fields = CSVUtil.Parse(currentLine + ",*,*");
            parsedEntries.Add(fields);
        }
    }
    parsedEntries.Sort(new ComparerAnonymousHelper());
    foreach (string[] fields in parsedEntries)
    {
        result.Put(fields);
    }
    return result;
}
/// <summary>
/// Verifies that <c>Put</c> rejects an entry before its character category is
/// registered, and accepts entries once the categories have been registered.
/// </summary>
public void TestPut()
{
    UnknownDictionaryWriter unkDic = new UnknownDictionaryWriter(10 * 1024 * 1024);
    try
    {
        // No character categories registered yet, so this Put must fail.
        unkDic.Put(CSVUtil.Parse("KANJI,1285,11426,名詞,一般,*,*,*,*,*,*,*"));
        fail();
    }
    catch (Exception) // no variable declared, so the CS0168 pragma suppression is unnecessary
    {
        // expected
    }

    string entry1 = "ALPHA,1285,1285,13398,名詞,一般,*,*,*,*,*,*,*";
    string entry2 = "HIRAGANA,1285,1285,13069,名詞,一般,*,*,*,*,*,*,*";
    string entry3 = "KANJI,1285,1285,11426,名詞,一般,*,*,*,*,*,*,*";

    // Register the categories so the entries below are accepted.
    unkDic.PutCharacterCategory(0, "ALPHA");
    unkDic.PutCharacterCategory(1, "HIRAGANA");
    unkDic.PutCharacterCategory(2, "KANJI");

    unkDic.Put(CSVUtil.Parse(entry1));
    unkDic.Put(CSVUtil.Parse(entry2));
    unkDic.Put(CSVUtil.Parse(entry3));
}
/// <summary>
/// Encodes one parsed CSV dictionary entry into the internal byte buffer and
/// records its POS data in <c>posDict</c>, keyed by the entry's left id.
/// Base form, reading, and pronunciation are written only when they differ
/// from derivable defaults (tracked via flag bits packed next to the left id).
/// </summary>
/// <param name="entry">
/// Parsed CSV fields. entry[0] is the headword the other fields are compared
/// against; [1]=left id, [2]=right id, [3]=word cost, [4..7]=POS parts,
/// [8..9]=inflection data, [10]=base form, [11]=reading, [12]=pronunciation.
/// </param>
/// <returns>Current position of buffer, which will be wordId of next entry.</returns>
public virtual int Put(string[] entry)
{
    short leftId = short.Parse(entry[1], CultureInfo.InvariantCulture);
    short rightId = short.Parse(entry[2], CultureInfo.InvariantCulture);
    short wordCost = short.Parse(entry[3], CultureInfo.InvariantCulture);

    StringBuilder sb = new StringBuilder();

    // build up the POS string: join non-"*" parts of entry[4..7] with '-'
    for (int i = 4; i < 8; i++)
    {
        string part = entry[i];
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(part.Length > 0);
        }
        if (!"*".Equals(part, StringComparison.Ordinal))
        {
            if (sb.Length > 0)
            {
                sb.Append('-');
            }
            sb.Append(part);
        }
    }

    string posData = sb.ToString();

    // Full POS data is "pos,infl1,infl2" (CSV-escaped); "*" fields are left empty.
    sb.Length = 0;
    sb.Append(CSVUtil.QuoteEscape(posData));
    sb.Append(',');
    if (!"*".Equals(entry[8], StringComparison.Ordinal))
    {
        sb.Append(CSVUtil.QuoteEscape(entry[8]));
    }
    sb.Append(',');
    if (!"*".Equals(entry[9], StringComparison.Ordinal))
    {
        sb.Append(CSVUtil.QuoteEscape(entry[9]));
    }
    string fullPOSData = sb.ToString();

    string baseForm = entry[10];
    string reading = entry[11];
    string pronunciation = entry[12];

    // extend buffer if necessary
    int left = m_buffer.Remaining;
    // worst case: two short, 3 bytes, and features (all as utf-16)
    int worstCase = 4 + 3 + 2 * (baseForm.Length + reading.Length + pronunciation.Length);
    if (worstCase > left)
    {
        // grow to an oversized capacity, then copy the already-written bytes over
        ByteBuffer newBuffer = ByteBuffer.Allocate(ArrayUtil.Oversize(m_buffer.Limit + worstCase - left, 1));
        m_buffer.Flip();
        newBuffer.Put(m_buffer);
        m_buffer = newBuffer;
    }

    // Flags mark which optional features are actually stored; a feature equal
    // to its derivable default ("*"/headword/katakana form/reading) is omitted.
    int flags = 0;
    if (!("*".Equals(baseForm, StringComparison.Ordinal) || baseForm.Equals(entry[0], StringComparison.Ordinal)))
    {
        flags |= BinaryDictionary.HAS_BASEFORM;
    }
    if (!reading.Equals(ToKatakana(entry[0]), StringComparison.Ordinal))
    {
        flags |= BinaryDictionary.HAS_READING;
    }
    if (!pronunciation.Equals(reading, StringComparison.Ordinal))
    {
        flags |= BinaryDictionary.HAS_PRONUNCIATION;
    }

    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(leftId == rightId);
        Debugging.Assert(leftId < 4096); // there are still unused bits
    }
    // add pos mapping: grow posDict with null placeholders so leftId is a valid index
    int toFill = 1 + leftId - posDict.Count;
    for (int i = 0; i < toFill; i++)
    {
        posDict.Add(null);
    }
    // a slot may only ever hold one POS string; re-puts must agree
    string existing = posDict[leftId];
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(existing == null || existing.Equals(fullPOSData, StringComparison.Ordinal));
    }
    posDict[leftId] = fullPOSData;

    // pack leftId into the upper 13 bits alongside the 3 flag bits (leftId < 4096 asserted above)
    m_buffer.PutInt16((short)(leftId << 3 | flags));
    m_buffer.PutInt16(wordCost);

    if ((flags & BinaryDictionary.HAS_BASEFORM) != 0)
    {
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(baseForm.Length < 16);
        }
        // prefix-compress against the headword: one byte holds
        // (shared-prefix length << 4 | suffix length), then the suffix chars
        int shared = SharedPrefix(entry[0], baseForm);
        int suffix = baseForm.Length - shared;
        m_buffer.Put((byte)(shared << 4 | suffix));
        for (int i = shared; i < baseForm.Length; i++)
        {
            m_buffer.PutChar(baseForm[i]);
        }
    }

    if ((flags & BinaryDictionary.HAS_READING) != 0)
    {
        // length byte's low bit selects the encoding: 1 = compact katakana, 0 = raw utf-16
        if (IsKatakana(reading))
        {
            m_buffer.Put((byte)(reading.Length << 1 | 1));
            WriteKatakana(reading);
        }
        else
        {
            m_buffer.Put((byte)(reading.Length << 1));
            for (int i = 0; i < reading.Length; i++)
            {
                m_buffer.PutChar(reading[i]);
            }
        }
    }

    if ((flags & BinaryDictionary.HAS_PRONUNCIATION) != 0)
    {
        // we can save 150KB here, but it makes the reader a little complicated.
        // int shared = sharedPrefix(reading, pronunciation);
        // buffer.put((byte) shared);
        // pronunciation = pronunciation.substring(shared);
        if (IsKatakana(pronunciation))
        {
            m_buffer.Put((byte)(pronunciation.Length << 1 | 1));
            WriteKatakana(pronunciation);
        }
        else
        {
            m_buffer.Put((byte)(pronunciation.Length << 1));
            for (int i = 0; i < pronunciation.Length; i++)
            {
                m_buffer.PutChar(pronunciation[i]);
            }
        }
    }

    return (m_buffer.Position);
}
/// <summary>
/// Builds the token info dictionary from the given MeCab-format CSV files:
/// parses every entry (optionally adding an NFKC-normalized copy), sorts the
/// entries, writes them into a <see cref="TokenInfoDictionaryWriter"/>, and
/// builds the term FST mapping each distinct surface form to an ordinal.
/// </summary>
/// <param name="csvFiles">Paths of the CSV dictionary files to read.</param>
/// <returns>The populated dictionary writer with its FST set.</returns>
public virtual TokenInfoDictionaryWriter BuildDictionary(IList<string> csvFiles)
{
    TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);

    // all lines in the file
    Console.WriteLine(" parse...");
    List<string[]> lines = new List<string[]>(400000);
    foreach (string file in csvFiles)
    {
        using (Stream inputStream = new FileStream(file, FileMode.Open, FileAccess.Read))
        {
            Encoding decoder = Encoding.GetEncoding(encoding);
            // The reader must be disposed too (the original leaked it); nesting
            // the using keeps the stream's lifetime unchanged.
            using (TextReader reader = new StreamReader(inputStream, decoder))
            {
                string line = null;
                while ((line = reader.ReadLine()) != null)
                {
                    string[] entry = CSVUtil.Parse(line);

                    if (entry.Length < 13)
                    {
                        Console.WriteLine("Entry in CSV is not valid: " + line);
                        continue;
                    }

                    string[] formatted = FormatEntry(entry);
                    lines.Add(formatted);

                    // NFKC normalize dictionary entry
                    if (normalizeEntries)
                    {
                        //if (normalizer.isNormalized(entry[0])){
                        if (entry[0].IsNormalized(NormalizationForm.FormKC))
                        {
                            continue;
                        }
                        string[] normalizedEntry = new string[entry.Length];
                        for (int i = 0; i < entry.Length; i++)
                        {
                            //normalizedEntry[i] = normalizer.normalize(entry[i]);
                            normalizedEntry[i] = entry[i].Normalize(NormalizationForm.FormKC);
                        }
                        formatted = FormatEntry(normalizedEntry);
                        lines.Add(formatted);
                    }
                }
            }
        }
    }
    Console.WriteLine(" sort...");

    // sort by term: we sorted the files already and use a stable sort.
    lines.Sort(new ComparerAnonymousHelper());

    Console.WriteLine(" encode...");

    PositiveInt32Outputs fstOutput = PositiveInt32Outputs.Singleton;
    Builder<long?> fstBuilder = new Builder<long?>(Lucene.Net.Util.Fst.FST.INPUT_TYPE.BYTE2, 0, 0, true, true, int.MaxValue, fstOutput, null, true, PackedInt32s.DEFAULT, true, 15);
    Int32sRef scratch = new Int32sRef();
    long ord = -1; // first ord will be 0
    string lastValue = null;

    // build tokeninfo dictionary
    foreach (string[] entry in lines)
    {
        int next = dictionary.Put(entry);

        // Put returning the unchanged offset means the entry was not written.
        if (next == offset)
        {
            Console.WriteLine("Failed to process line: " + Collections.ToString(entry));
            continue;
        }

        string token = entry[0];
        if (!token.Equals(lastValue, StringComparison.Ordinal))
        {
            // new word to add to fst
            ord++;
            lastValue = token;
            scratch.Grow(token.Length);
            scratch.Length = token.Length;
            for (int i = 0; i < token.Length; i++)
            {
                scratch.Int32s[i] = (int)token[i];
            }
            fstBuilder.Add(scratch, ord);
        }
        // multiple entries may share a surface form; all map to the same ord
        dictionary.AddMapping((int)ord, offset);
        offset = next;
    }

    FST<long?> fst = fstBuilder.Finish();

    Console.WriteLine(" " + fst.NodeCount + " nodes, " + fst.ArcCount + " arcs, " + fst.GetSizeInBytes() + " bytes... ");
    dictionary.SetFST(fst);
    Console.WriteLine(" done");
    return dictionary;
}