/// <summary>
/// Deserializing constructor: rebuilds the index from binary data.
/// </summary>
/// <param name="br">Reader that supplies the serialized index data.</param>
public Index(BinReader br)
{
    // The word holder is read first
    WordHolder = new CedictEngine.WordHolder(br);

    // Sense index: a key count, followed by that many (token ID, item) pairs
    SenseIndex = new Dictionary<int, SenseIndexItem>();
    int senseKeyCount = br.ReadInt();
    int senseKeysRead = 0;
    while (senseKeysRead != senseKeyCount)
    {
        int key = br.ReadInt();
        SenseIndexItem item = new SenseIndexItem(br);
        SenseIndex[key] = item;
        ++senseKeysRead;
    }

    // Both remaining dictionaries are created before any of their data is read
    IdeoIndex = new Dictionary<char, IdeoIndexItem>();
    PinyinIndex = new Dictionary<string, PinyinIndexItem>();

    // Ideograph index: a key count, followed by that many (character, item) pairs
    int ideoKeyCount = br.ReadInt();
    int ideoKeysRead = 0;
    while (ideoKeysRead != ideoKeyCount)
    {
        char key = br.ReadChar();
        IdeoIndexItem item = new IdeoIndexItem(br);
        IdeoIndex[key] = item;
        ++ideoKeysRead;
    }

    // Pinyin index: a key count, followed by that many (string, item) pairs
    int pinyinKeyCount = br.ReadInt();
    int pinyinKeysRead = 0;
    while (pinyinKeysRead != pinyinKeyCount)
    {
        string key = br.ReadString();
        PinyinIndexItem item = new PinyinIndexItem(br);
        PinyinIndex[key] = item;
        ++pinyinKeysRead;
    }
}
/// <summary>
/// Indexes one parsed Cedict entry (hanzi, pinyin and target-language indexes).
/// </summary>
/// <param name="entry">The parsed entry whose headwords, pinyin and senses are indexed.</param>
/// <param name="id">Value stored in the indexes to refer to this entry.</param>
/// <exception cref="ArgumentException">
/// A headword contains more than 255 distinct characters, or a pinyin syllable
/// has a tone other than -1, 0, 1, 2, 3 or 4.
/// </exception>
private void indexEntry(CedictEntry entry, int id)
{
    // Collect the distinct characters of both headwords.
    // The distinct-character count is stored as a byte, hence the 255 limit.
    HashSet<char> simpSet = new HashSet<char>();
    foreach (char c in entry.ChSimpl)
    {
        simpSet.Add(c);
    }
    if (simpSet.Count > byte.MaxValue)
    {
        throw new ArgumentException("Simplified headword too long; max: 255.");
    }
    byte simpCount = (byte)simpSet.Count;

    HashSet<char> tradSet = new HashSet<char>();
    foreach (char c in entry.ChTrad)
    {
        tradSet.Add(c);
    }
    if (tradSet.Count > byte.MaxValue)
    {
        throw new ArgumentException("Traditional headword too long; max: 255.");
    }
    byte tradCount = (byte)tradSet.Count;

    // Index characters of simplified headword
    foreach (char c in simpSet)
    {
        IdeoIndexItem ii;
        // Single lookup; create and register the item on first sight of this character
        if (!index.IdeoIndex.TryGetValue(c, out ii))
        {
            ii = new IdeoIndexItem();
            index.IdeoIndex[c] = ii;
        }
        ii.EntriesHeadwordSimp.Add(new IdeoEntryPtr { EntryIdx = id, HwCharCount = simpCount });
    }

    // Index characters of traditional headword
    foreach (char c in tradSet)
    {
        IdeoIndexItem ii;
        if (!index.IdeoIndex.TryGetValue(c, out ii))
        {
            ii = new IdeoIndexItem();
            index.IdeoIndex[c] = ii;
        }
        ii.EntriesHeadwordTrad.Add(new IdeoEntryPtr { EntryIdx = id, HwCharCount = tradCount });
    }

    // Index pinyin syllables
    foreach (PinyinSyllable pys in entry.Pinyin)
    {
        // Index contains lower-case syllables
        string textLo = pys.Text.ToLowerInvariant();
        PinyinIndexItem pi;
        if (!index.PinyinIndex.TryGetValue(textLo, out pi))
        {
            pi = new PinyinIndexItem();
            index.PinyinIndex[textLo] = pi;
        }

        // Pick the list within the index item by tone:
        // -1 goes to EntriesNT, 0..4 go to Entries0..Entries4
        List<int> entryList;
        switch (pys.Tone)
        {
            case -1:
                entryList = pi.EntriesNT;
                break;
            case 0:
                entryList = pi.Entries0;
                break;
            case 1:
                entryList = pi.Entries1;
                break;
            case 2:
                entryList = pi.Entries2;
                break;
            case 3:
                entryList = pi.Entries3;
                break;
            case 4:
                entryList = pi.Entries4;
                break;
            default:
                throw new ArgumentException("Invalid tone: " + pys.Tone.ToString());
        }

        // Avoid indexing the same entry twice if a syllable occurs multiple times.
        // Only the last element is compared: if this entry was already added to the
        // list during this call, it is the most recent addition.
        if (entryList.Count == 0 || entryList[entryList.Count - 1] != id)
        {
            entryList.Add(id);
        }
    }

    // Index equiv of each sense; senseIx is the sense's zero-based position in the entry
    int senseIx = -1;
    foreach (CedictSense sense in entry.Senses)
    {
        ++senseIx;
        // Empty equiv: nothing to index
        if (sense.Equiv.IsEmpty)
        {
            continue;
        }
        // Tokenize, then index the sense
        ReadOnlyCollection<EquivToken> tokens = tokenizer.Tokenize(sense.Equiv);
        indexSense(tokens, id, senseIx);
    }
}
/// <summary>
/// Retrieves entries (sorted) whose headword contains hanzi from search expression.
/// </summary>
/// <param name="br">Reader passed on to <c>doLoadVerifyHanzi</c> for loading candidate entries.</param>
/// <param name="query">The search expression; each distinct character in it must occur in a headword.</param>
/// <param name="script">Search script, passed on to <c>doLoadVerifyHanzi</c>.</param>
/// <returns>
/// The verified results, sorted with <c>hrComp</c>. An empty list if any character
/// of the query is absent from the ideograph index.
/// </returns>
List<CedictResult> doHanziLookupHead(BinReader br, string query, SearchScript script)
{
    // Get every character once - we ignore repeats
    HashSet<char> queryChars = new HashSet<char>(query);

    // Map from keys (entry positions) to # of query chars found in entry
    Dictionary<int, int> posToCountSimp = new Dictionary<int, int>();
    Dictionary<int, int> posToCountTrad = new Dictionary<int, int>();

    // Look at each character's entry position vector, increment counts
    foreach (char c in queryChars)
    {
        // If there's a hanzi that's not in the index, there can be no hits at all
        IdeoIndexItem iii;
        if (!index.IdeoIndex.TryGetValue(c, out iii))
        {
            return new List<CedictResult>();
        }

        // Count separately for simplified and traditional.
        // TryGetValue leaves the count at 0 for unseen positions, so one path covers both cases.
        foreach (var iep in iii.EntriesHeadwordSimp)
        {
            int pos = iep.EntryIdx;
            int seen;
            posToCountSimp.TryGetValue(pos, out seen);
            posToCountSimp[pos] = seen + 1;
        }
        foreach (var iep in iii.EntriesHeadwordTrad)
        {
            int pos = iep.EntryIdx;
            int seen;
            posToCountTrad.TryGetValue(pos, out seen);
            posToCountTrad[pos] = seen + 1;
        }
    }

    // Get positions that contain all chars from query, in either headword.
    // NOTE(review): a count equal to the number of distinct query chars means "all present"
    // only if an entry occurs at most once in each character's list - confirm against the indexer.
    HashSet<int> matchingPositions = new HashSet<int>();
    foreach (var x in posToCountSimp)
    {
        if (x.Value == queryChars.Count)
        {
            matchingPositions.Add(x.Key);
        }
    }
    foreach (var x in posToCountTrad)
    {
        if (x.Value == queryChars.Count)
        {
            matchingPositions.Add(x.Key);
        }
    }

    // Now fetch and verify results
    List<ResWithEntry> resWE = doLoadVerifyHanzi(br, matchingPositions, query, script);

    // Sort hanzi results
    resWE.Sort((a, b) => hrComp(a, b));

    // Done: unwrap the results. Presize by element count, not by the source list's capacity.
    List<CedictResult> res = new List<CedictResult>(resWE.Count);
    foreach (ResWithEntry rwe in resWE)
    {
        res.Add(rwe.Res);
    }
    return res;
}