/// <summary>
/// Ctor: rebuilds the index from its binary serialization.
/// </summary>
/// <param name="br">Binary reader the data is pulled from, consumed strictly in sequence.</param>
public Index(BinReader br)
{
    // The word holder is read first.
    WordHolder = new CedictEngine.WordHolder(br);

    // Sense index: a key count, then (token ID, item) pairs.
    SenseIndex = new Dictionary<int, SenseIndexItem>();
    int senseEntries = br.ReadInt();
    for (int n = 0; n != senseEntries; ++n)
    {
        int key = br.ReadInt();
        SenseIndex[key] = new SenseIndexItem(br);
    }

    // Ideograph index: a key count, then (character, item) pairs.
    IdeoIndex = new Dictionary<char, IdeoIndexItem>();
    int ideoEntries = br.ReadInt();
    for (int n = 0; n != ideoEntries; ++n)
    {
        char key = br.ReadChar();
        IdeoIndex[key] = new IdeoIndexItem(br);
    }

    // Pinyin index: a key count, then (syllable text, item) pairs.
    PinyinIndex = new Dictionary<string, PinyinIndexItem>();
    int pinyinEntries = br.ReadInt();
    for (int n = 0; n != pinyinEntries; ++n)
    {
        string key = br.ReadString();
        PinyinIndex[key] = new PinyinIndexItem(br);
    }
}
/// <summary>
/// Ctor: populates all index tables by reading them back from binary data.
/// </summary>
/// <param name="br">Binary reader supplying the serialized data; it is read front to back.</param>
public Index(BinReader br)
{
    // Stream layout, in order: word holder, sense index, ideograph index, pinyin index.
    WordHolder = new CedictEngine.WordHolder(br);

    // Each table is stored as a key count followed by that many (key, item) pairs.
    SenseIndex = new Dictionary<int, SenseIndexItem>();
    int sensesLeftToRead = br.ReadInt();
    int sensesRead = 0;
    while (sensesRead != sensesLeftToRead)
    {
        int tokenKey = br.ReadInt();
        SenseIndexItem senseItem = new SenseIndexItem(br);
        SenseIndex[tokenKey] = senseItem;
        ++sensesRead;
    }

    IdeoIndex = new Dictionary<char, IdeoIndexItem>();
    PinyinIndex = new Dictionary<string, PinyinIndexItem>();

    int ideoTotal = br.ReadInt();
    int ideoRead = 0;
    while (ideoRead != ideoTotal)
    {
        char hanzi = br.ReadChar();
        IdeoIndexItem ideoItem = new IdeoIndexItem(br);
        IdeoIndex[hanzi] = ideoItem;
        ++ideoRead;
    }

    int pinyinTotal = br.ReadInt();
    int pinyinRead = 0;
    while (pinyinRead != pinyinTotal)
    {
        string syllable = br.ReadString();
        PinyinIndexItem pinyinItem = new PinyinIndexItem(br);
        PinyinIndex[syllable] = pinyinItem;
        ++pinyinRead;
    }
}
/// <summary>
/// Indexes one parsed Cedict entry (hanzi, pinyin and target-language indexes).
/// </summary>
/// <param name="entry">The parsed entry whose headwords, pinyin syllables and senses are indexed.</param>
/// <param name="id">Value recorded in the index lists to refer back to this entry.</param>
/// <exception cref="Exception">
/// Thrown when a headword has more than 255 distinct characters, or when a pinyin
/// syllable carries a tone outside -1..4.
/// </exception>
private void indexEntry(CedictEntry entry, int id)
{
    // Collect the distinct characters of both headwords. Both limits are validated
    // before anything is written to the index, so a rejected entry leaves no trace.
    HashSet<char> simpSet = new HashSet<char>();
    foreach (char c in entry.ChSimpl)
    {
        simpSet.Add(c);
    }
    // The limit applies to the number of distinct characters, which is stored in a byte.
    if (simpSet.Count > byte.MaxValue)
    {
        throw new Exception("Simplified headword too long; max: 255.");
    }
    byte simpCount = (byte)simpSet.Count;

    HashSet<char> tradSet = new HashSet<char>();
    foreach (char c in entry.ChTrad)
    {
        tradSet.Add(c);
    }
    if (tradSet.Count > byte.MaxValue)
    {
        throw new Exception("Traditional headword too long; max: 255.");
    }
    byte tradCount = (byte)tradSet.Count;

    // Index characters of simplified headword
    foreach (char c in simpSet)
    {
        // Single lookup; create the item on first sight of this character.
        IdeoIndexItem ii;
        if (!index.IdeoIndex.TryGetValue(c, out ii))
        {
            ii = new IdeoIndexItem();
            index.IdeoIndex[c] = ii;
        }
        ii.EntriesHeadwordSimp.Add(new IdeoEntryPtr { EntryIdx = id, HwCharCount = simpCount });
    }

    // Index characters of traditional headword
    foreach (char c in tradSet)
    {
        IdeoIndexItem ii;
        if (!index.IdeoIndex.TryGetValue(c, out ii))
        {
            ii = new IdeoIndexItem();
            index.IdeoIndex[c] = ii;
        }
        ii.EntriesHeadwordTrad.Add(new IdeoEntryPtr { EntryIdx = id, HwCharCount = tradCount });
    }

    // Index pinyin syllables
    foreach (PinyinSyllable pys in entry.Pinyin)
    {
        // Index contains lower-case syllables
        string textLo = pys.Text.ToLowerInvariant();
        PinyinIndexItem pi;
        if (!index.PinyinIndex.TryGetValue(textLo, out pi))
        {
            pi = new PinyinIndexItem();
            index.PinyinIndex[textLo] = pi;
        }

        // Figure out which list in index item - by tone (-1 means no tone specified)
        List<int> entryList;
        switch (pys.Tone)
        {
            case -1: entryList = pi.EntriesNT; break;
            case 0: entryList = pi.Entries0; break;
            case 1: entryList = pi.Entries1; break;
            case 2: entryList = pi.Entries2; break;
            case 3: entryList = pi.Entries3; break;
            case 4: entryList = pi.Entries4; break;
            default: throw new Exception("Invalid tone: " + pys.Tone.ToString());
        }

        // Avoid indexing same entry twice if a syllable occurs multiple times:
        // all syllables of one entry are handled back to back, so a repeat can only
        // ever be the last element of the list.
        if (entryList.Count == 0 || entryList[entryList.Count - 1] != id)
        {
            entryList.Add(id);
        }
    }

    // Index equiv of each sense; senseIx is the sense's position within the entry
    int senseIx = -1;
    foreach (CedictSense sense in entry.Senses)
    {
        ++senseIx;
        // Empty equiv: nothing to index
        if (sense.Equiv.IsEmpty)
        {
            continue;
        }
        // Tokenize
        ReadOnlyCollection<EquivToken> tokens = tokenizer.Tokenize(sense.Equiv);
        // Index sense
        indexSense(tokens, id, senseIx);
    }
}
/// <summary>
/// Indexes one parsed Cedict entry (hanzi, pinyin and target-language indexes).
/// </summary>
/// <param name="entry">The parsed entry whose headwords, pinyin syllables and senses are indexed.</param>
/// <param name="id">Value recorded in the index lists to refer back to this entry.</param>
/// <exception cref="Exception">Thrown when a pinyin syllable carries a tone outside -1..4.</exception>
private void indexEntry(CedictEntry entry, int id)
{
    // Index characters of simplified headword
    foreach (char c in entry.ChSimpl)
    {
        // Single lookup; create the item on first sight of this character.
        IdeoIndexItem ii;
        if (!index.IdeoIndex.TryGetValue(c, out ii))
        {
            ii = new IdeoIndexItem();
            index.IdeoIndex[c] = ii;
        }
        // Avoid indexing same entry twice if a char occurs multiple times:
        // all characters of one entry are handled back to back, so a repeat can
        // only ever be the last element of the list.
        var simpList = ii.EntriesHeadwordSimp;
        if (simpList.Count == 0 || simpList[simpList.Count - 1] != id)
        {
            simpList.Add(id);
        }
    }

    // Index characters of traditional headword
    foreach (char c in entry.ChTrad)
    {
        IdeoIndexItem ii;
        if (!index.IdeoIndex.TryGetValue(c, out ii))
        {
            ii = new IdeoIndexItem();
            index.IdeoIndex[c] = ii;
        }
        // Avoid indexing same entry twice if a char occurs multiple times
        var tradList = ii.EntriesHeadwordTrad;
        if (tradList.Count == 0 || tradList[tradList.Count - 1] != id)
        {
            tradList.Add(id);
        }
    }

    // Index pinyin syllables
    foreach (PinyinSyllable pys in entry.Pinyin)
    {
        // Index contains lower-case syllables
        string textLo = pys.Text.ToLowerInvariant();
        PinyinIndexItem pi;
        if (!index.PinyinIndex.TryGetValue(textLo, out pi))
        {
            pi = new PinyinIndexItem();
            index.PinyinIndex[textLo] = pi;
        }

        // Figure out which list in index item - by tone (-1 means no tone specified)
        List<int> entryList;
        switch (pys.Tone)
        {
            case -1: entryList = pi.EntriesNT; break;
            case 0: entryList = pi.Entries0; break;
            case 1: entryList = pi.Entries1; break;
            case 2: entryList = pi.Entries2; break;
            case 3: entryList = pi.Entries3; break;
            case 4: entryList = pi.Entries4; break;
            default: throw new Exception("Invalid tone: " + pys.Tone.ToString());
        }

        // Avoid indexing same entry twice if a syllable occurs multiple times
        if (entryList.Count == 0 || entryList[entryList.Count - 1] != id)
        {
            entryList.Add(id);
        }
    }

    // Index equiv of each sense; senseIx is the sense's position within the entry
    int senseIx = -1;
    foreach (CedictSense sense in entry.Senses)
    {
        ++senseIx;
        // Empty equiv: nothing to index
        if (sense.Equiv.IsEmpty)
        {
            continue;
        }
        // Tokenize
        ReadOnlyCollection<EquivToken> tokens = tokenizer.Tokenize(sense.Equiv);
        // Index sense
        indexSense(tokens, id, senseIx);
    }
}
/// <summary>
/// Retrieves entries (sorted) whose headword contains pinyin from search expression.
/// </summary>
/// <param name="br">Binary reader handed on to doLoadVerifyPinyin together with the candidate positions.</param>
/// <param name="sylls">
/// Query syllables. Repeats are collapsed for the index lookup only; the list is
/// passed unchanged to doLoadVerifyPinyin.
/// </param>
/// <returns>
/// Results returned by doLoadVerifyPinyin for entries that contain every distinct
/// query syllable, ordered by pyComp. Empty if any query syllable is missing from the index.
/// </returns>
/// <exception cref="Exception">Thrown when a query syllable carries a tone outside -1..4.</exception>
List<CedictResult> doPinyinLookupHead(BinReader br, List<PinyinSyllable> sylls)
{
    // Get every syllable once - we ignore repeats
    // If a syllable occurs with unspecified tone once, or if it occurs with multiple tone marks
    // -> We only take it as one item with unspecified tone (-1)
    // Otherwise, take it as is, with tone mark
    // NOTE(review): the index builder in this file stores syllable text lower-cased
    // (ToLowerInvariant) while this lookup uses syll.Text as-is - confirm that callers
    // normalize the query's case.
    Dictionary<string, int> syllDict = new Dictionary<string, int>();
    foreach (var syll in sylls)
    {
        int knownTone;
        if (!syllDict.TryGetValue(syll.Text, out knownTone))
        {
            syllDict[syll.Text] = syll.Tone;
        }
        else if (knownTone != syll.Tone)
        {
            syllDict[syll.Text] = -1;
        }
    }
    List<PinyinSyllable> querySylls = new List<PinyinSyllable>(syllDict.Count);
    foreach (var x in syllDict)
    {
        querySylls.Add(new PinyinSyllable(x.Key, x.Value));
    }

    // Map from keys (entry positions) to # of query syllables found in entry
    Dictionary<int, int> posToCount = new Dictionary<int, int>();

    // Look at each query syllable, increment counts for entries in syllable's list(s)
    foreach (PinyinSyllable syll in querySylls)
    {
        // If this syllable is not indexed, we sure won't have any hits
        PinyinIndexItem pii;
        if (!index.PinyinIndex.TryGetValue(syll.Text, out pii))
        {
            return new List<CedictResult>();
        }

        // Positions credited with this syllable; each position appears at most once
        IEnumerable<int> hits;
        switch (syll.Tone)
        {
            // Query specifies a tone mark: just that list
            case 0: hits = pii.Entries0; break;
            case 1: hits = pii.Entries1; break;
            case 2: hits = pii.Entries2; break;
            case 3: hits = pii.Entries3; break;
            case 4: hits = pii.Entries4; break;
            // Query does not specify a tone mark:
            // get union of instance lists, so each position is incremented once
            case -1:
                HashSet<int> posSet = new HashSet<int>();
                posSet.UnionWith(pii.Entries0);
                posSet.UnionWith(pii.Entries1);
                posSet.UnionWith(pii.Entries2);
                posSet.UnionWith(pii.Entries3);
                posSet.UnionWith(pii.Entries4);
                posSet.UnionWith(pii.EntriesNT);
                hits = posSet;
                break;
            default:
                throw new Exception("Invalid tone: " + syll.Tone.ToString());
        }

        foreach (int pos in hits)
        {
            // TryGetValue leaves seen at 0 for a position not counted before
            int seen;
            posToCount.TryGetValue(pos, out seen);
            posToCount[pos] = seen + 1;
        }
    }

    // Get positions that contain all syllables from query
    HashSet<int> matchingPositions = new HashSet<int>();
    foreach (var x in posToCount)
    {
        if (x.Value == querySylls.Count)
        {
            matchingPositions.Add(x.Key);
        }
    }

    // Now fetch and verify results
    List<ResWithEntry> resWE = doLoadVerifyPinyin(br, matchingPositions, sylls);
    // Sort pinyin results
    resWE.Sort((a, b) => pyComp(a, b));

    // Done. Size the output by element count, not by the source list's capacity.
    List<CedictResult> res = new List<CedictResult>(resWE.Count);
    foreach (ResWithEntry rwe in resWE)
    {
        res.Add(rwe.Res);
    }
    return res;
}