/// <summary>
/// Adds an entry's Pinyin to the DB's index/instance tables by executing
/// <c>cmdInsPinyinInstance</c> once for every distinct syllable of the entry.
/// </summary>
/// <param name="entry">Entry whose <c>Pinyin</c> syllables are indexed.</param>
/// <param name="entryId">Value bound to the <c>@blob_id</c> parameter of every execution.</param>
private void indexPinyin(CedictEntry entry, int entryId)
{
    // Gather each (lower-cased text, tone) pair once, in order of first appearance.
    List<PinyinSyllable> distinct = new List<PinyinSyllable>();
    foreach (PinyinSyllable syll in entry.Pinyin)
    {
        // Text is lower-cased so that casing alone does not produce a second instance.
        PinyinSyllable lowered = new PinyinSyllable(syll.Text.ToLowerInvariant(), syll.Tone);

        // Scan for an already collected syllable with the same text and tone.
        int i = 0;
        while (i < distinct.Count
               && !(distinct[i].Text == lowered.Text && distinct[i].Tone == lowered.Tone))
        {
            ++i;
        }

        // Ran off the end of the list: this pair has not been seen yet.
        if (i == distinct.Count)
        {
            distinct.Add(lowered);
        }
    }

    // These two values are shared by every row written for this entry.
    cmdInsPinyinInstance.Parameters["@syll_count"].Value = distinct.Count;
    cmdInsPinyinInstance.Parameters["@blob_id"].Value = entryId;

    // One execution per distinct syllable.
    foreach (PinyinSyllable item in distinct)
    {
        int textHash = CedictEntry.Hash(item.Text);
        cmdInsPinyinInstance.Parameters["@pinyin_hash"].Value = textHash;
        cmdInsPinyinInstance.Parameters["@tone"].Value = item.Tone;
        cmdInsPinyinInstance.ExecuteNonQuery();
    }
}
/// <summary>
/// Binds the parameters of <c>cmdInsEntry</c>, executes it, and returns the
/// ID the command reports for the inserted row.
/// </summary>
/// <param name="simp">Simplified headword; only its <see cref="CedictEntry.Hash"/> value is bound (<c>@simp_hash</c>).</param>
/// <param name="head">Value bound to <c>@hw</c>.</param>
/// <param name="trg">Value bound to <c>@trg</c>.</param>
/// <param name="binId">Value bound to <c>@bin_id</c>.</param>
/// <returns><c>cmdInsEntry.LastInsertedId</c> after execution, cast to <c>int</c>.</returns>
protected int storeEntry(string simp, string head, string trg, int binId)
{
    var prms = cmdInsEntry.Parameters;

    // Caller-supplied values
    prms["@hw"].Value = head;
    prms["@trg"].Value = trg;
    int simpHash = CedictEntry.Hash(simp);
    prms["@simp_hash"].Value = simpHash;

    // Status and deleted flag are always written as 0 here
    prms["@status"].Value = 0;
    prms["@deleted"].Value = 0;
    prms["@bin_id"].Value = binId;

    cmdInsEntry.ExecuteNonQuery();

    var insertedId = cmdInsEntry.LastInsertedId;
    return (int)insertedId;
}
/// <summary>
/// See <see cref="IHeadwordInfo.GetEntries"/>.
/// </summary>
/// <param name="simp">Simplified headword to look up.</param>
/// <param name="ced">Receives the entries read from the CEDICT chain whose simplified form equals <paramref name="simp"/>; empty if none.</param>
/// <param name="hdd">Receives the entries read from the HanDeDict chain whose simplified form equals <paramref name="simp"/>; empty if none.</param>
public void GetEntries(string simp, out CedictEntry[] ced, out CedictEntry[] hdd)
{
    List<CedictEntry> cedList = new List<CedictEntry>();
    List<CedictEntry> hddList = new List<CedictEntry>();

    // Do we have this hash?
    int hash = CedictEntry.Hash(simp);
    HashChainPointer hcp = new HashChainPointer(hash);
    int pos = Array.BinarySearch(hashPtrs, hcp, new HashComp());

    // Only open the data file when the hash is actually indexed.
    if (pos >= 0)
    {
        using (BinReader br = new BinReader(dataFileName))
        {
            // A start position of 0 means "no chain"; the helper then reads nothing.
            collectChainMatches(br, hashPtrs[pos].CedictPos, simp, cedList);
            collectChainMatches(br, hashPtrs[pos].HanDeDictPos, simp, hddList);
        }
    }

    // Our results
    ced = cedList.ToArray();
    hdd = hddList.ToArray();
}

/// <summary>
/// Walks one hash chain in the data file and collects the entries whose
/// simplified form is exactly <paramref name="simp"/>.
/// </summary>
/// <param name="br">Open reader over the data file.</param>
/// <param name="startPos">File position of the first chain element; 0 for an empty chain.</param>
/// <param name="simp">Simplified headword that entries must match.</param>
/// <param name="target">List that matching entries are appended to.</param>
private static void collectChainMatches(BinReader br, int startPos, string simp, List<CedictEntry> target)
{
    int binPos = startPos;
    while (binPos != 0)
    {
        br.Position = binPos;
        // Each element starts with the position of the next one in the chain (0 ends it)
        binPos = br.ReadInt();
        // The entry itself follows
        CedictEntry entry = new CedictEntry(br);
        // Entries share a chain by hash only; a different headword here is a hash collision
        if (entry.ChSimpl == simp)
        {
            target.Add(entry);
        }
    }
}
/// <summary>
/// Processes one line of a dictionary source file: parses it, writes the entry
/// to <paramref name="bw"/>, and records the write position under the hash of
/// the entry's simplified headword.
/// </summary>
/// <param name="line">Raw line. Null, empty and '#'-prefixed lines are ignored.</param>
/// <param name="cedict">
/// True to use <c>cedictDropped</c> / <c>cedictHashPoss</c>; false to use
/// <c>hddDropped</c> / <c>hddHashPoss</c>.
/// </param>
/// <param name="bw">Writer that receives the serialized entry.</param>
private void dictLine(string line, bool cedict, BinWriter bw)
{
    // Nothing to parse on missing, blank or comment lines. The comment marker is
    // checked as a plain character so the result never depends on the current culture.
    if (string.IsNullOrEmpty(line) || line[0] == '#')
    {
        return;
    }

    // Parse entry
    CedictEntry entry = parser.ParseEntry(line, 0, null);

    // Verify that simp, trad and pinyin are equal length; treat a mismatch like a parse failure
    if (entry != null)
    {
        int simpLength = entry.ChSimpl.Length;
        if (simpLength != entry.ChTrad.Length || simpLength != entry.PinyinCount)
        {
            entry = null;
        }
    }

    // Just count if failed to parse
    if (entry == null)
    {
        if (cedict)
        {
            ++cedictDropped;
        }
        else
        {
            ++hddDropped;
        }
        return;
    }

    // Serialize. Remember where this record starts.
    int fpos = bw.Position;
    // First: hash chain link to the next entry with the same hash. Written as 0 now, filled in later.
    bw.WriteInt(0);
    // Then, entry itself
    entry.Serialize(bw);

    // Hash simplified and remember file position
    int hash = CedictEntry.Hash(entry.ChSimpl);
    Dictionary<int, List<int>> hashPoss = cedict ? cedictHashPoss : hddHashPoss;
    List<int> poss;
    if (!hashPoss.TryGetValue(hash, out poss))
    {
        poss = new List<int>();
        hashPoss[hash] = poss;
    }
    poss.Add(fpos);
}
/// <summary>
/// See <see cref="ZD.Common.IHeadwordInfo.GetPossibleHeadwords"/>.
/// </summary>
/// <param name="simp">Simplified headword to look up.</param>
/// <param name="unihanFilter">Passed through unchanged to <c>addHeadIfNew</c>.</param>
/// <returns>
/// The headwords collected from the CEDICT chain for <paramref name="simp"/>;
/// an empty array when the hash is not indexed or nothing was collected.
/// </returns>
public HeadwordSyll[][] GetPossibleHeadwords(string simp, bool unihanFilter)
{
    // Locate the chain pointer for this headword's hash
    HashChainPointer probe = new HashChainPointer(CedictEntry.Hash(simp));
    int idx = Array.BinarySearch(hashPtrs, probe, new HashComp());
    bool hasChain = idx >= 0 && hashPtrs[idx].CedictPos != 0;
    if (!hasChain)
    {
        return new HeadwordSyll[0][];
    }

    // Walk the chain; every element begins with the position of its successor
    List<HeadwordSyll[]> heads = new List<HeadwordSyll[]>();
    using (BinReader reader = new BinReader(dataFileName))
    {
        for (int next = hashPtrs[idx].CedictPos; next != 0; )
        {
            reader.Position = next;
            next = reader.ReadInt();
            CedictEntry candidate = new CedictEntry(reader);
            // Same hash does not guarantee same headword: skip collisions
            if (candidate.ChSimpl == simp)
            {
                addHeadIfNew(heads, candidate, unihanFilter);
            }
        }
    }

    return heads.Count != 0 ? heads.ToArray() : new HeadwordSyll[0][];
}
/// <summary>
/// Retrieves, with a single query over <c>pinyin_instances</c>, the blob IDs
/// recorded for each requested syllable.
/// </summary>
/// <param name="sylls">
/// Syllables to look up. A tone of -1 means "any tone": such a syllable is matched
/// by hash only; otherwise hash and tone must both match.
/// </param>
/// <returns>
/// One set of blob IDs per requested syllable. The key is the syllable's text when
/// its tone is -1, and <c>GetDisplayString(false)</c> otherwise. Empty when
/// <paramref name="sylls"/> is null or empty.
/// </returns>
/// <exception cref="KeyNotFoundException">
/// A returned row could not be attributed to any requested syllable.
/// </exception>
private Dictionary<string, HashSet<int>> getPinyinCandidates(List<PinyinSyllable> sylls)
{
    Dictionary<string, HashSet<int>> res = new Dictionary<string, HashSet<int>>();

    // Without syllables the statement would end in a bare "WHERE;", which is not
    // valid SQL. There is nothing to look up, so answer without touching the DB.
    if (sylls == null || sylls.Count == 0)
    {
        return res;
    }

    // Numbers go into SQL text and dictionary keys; never format them by current culture.
    System.Globalization.CultureInfo inv = System.Globalization.CultureInfo.InvariantCulture;
    Dictionary<int, string> hashToText = new Dictionary<int, string>();

    // Build custom, single query to get *all* instances that are relevant for any
    // requested syllable. Also initialize the result dictionary with the pinyin keys.
    StringBuilder sb = new StringBuilder();
    sb.Append("SELECT pinyin_hash, tone, syll_count, blob_id FROM pinyin_instances WHERE");
    bool first = true;
    foreach (PinyinSyllable syll in sylls)
    {
        bool anyTone = syll.Tone == -1;
        int hash = CedictEntry.Hash(syll.Text);

        // Init dictionary
        string key = anyTone ? syll.Text : syll.GetDisplayString(false);
        res[key] = new HashSet<int>();
        hashToText[hash] = syll.Text;

        // One OR-ed condition per syllable
        if (first)
        {
            first = false;
        }
        else
        {
            sb.Append(" OR");
        }
        sb.Append(" (pinyin_hash=");
        sb.Append(hash.ToString(inv));
        if (!anyTone)
        {
            sb.Append(" AND tone=");
            sb.Append(syll.Tone.ToString(inv));
        }
        sb.Append(")");
    }
    sb.Append(";");

    // Compile and execute SQL command
    using (MySqlCommand cmd = new MySqlCommand(sb.ToString(), conn))
    using (MySqlDataReader rdr = cmd.ExecuteReader())
    {
        while (rdr.Read())
        {
            // Which query syllable(s) is this row for? The same syllable may have been
            // requested both without a tone and with this row's tone; the row is a
            // candidate for each of them, so store it under every key that exists.
            string text = hashToText[rdr.GetInt32(0)];
            string tonedKey = text + rdr.GetInt32(1).ToString(inv);
            int blobId = rdr.GetInt32(3);

            bool stored = false;
            HashSet<int> cands;
            if (res.TryGetValue(text, out cands))
            {
                cands.Add(blobId);
                stored = true;
            }
            if (res.TryGetValue(tonedKey, out cands))
            {
                cands.Add(blobId);
                stored = true;
            }
            if (!stored)
            {
                throw new KeyNotFoundException("No requested syllable matches returned pinyin instance '" + tonedKey + "'.");
            }
        }
    }

    // Done
    return res;
}