/// <summary>
/// Retrieves matching entries for a target-language search expression.
/// </summary>
/// <param name="br">Reader over the dictionary data; used to load sense instances, verify senses and read headword hanzi.</param>
/// <param name="query">Search expression; leading and trailing whitespace is trimmed before tokenizing.</param>
/// <returns>
/// One result per matching entry, ordered by descending best sense score.
/// Empty if the query is blank, contains a token that cannot match, or nothing is found.
/// </returns>
private List<CedictResult> doTargetLookup(BinReader br, string query)
{
    // Empty query string: no results
    query = query.Trim();
    if (query == string.Empty) return new List<CedictResult>();

    // Tokenize query string
    HybridText txtQuery = new HybridText(query);
    ReadOnlyCollection<EquivToken> txtTokenized = tokenizer.Tokenize(txtQuery);

    // Get query string's distinct token IDs.
    // A token with the "unknown" ID or the IdZho ID means no match; we know that immediately.
    HashSet<int> idSet = new HashSet<int>();
    foreach (EquivToken eqt in txtTokenized)
    {
        if (eqt.TokenId == WordHolder.IdUnknown || eqt.TokenId == index.WordHolder.IdZho)
            return new List<CedictResult>();
        idSet.Add(eqt.TokenId);
    }

    List<CedictResult> res = new List<CedictResult>();

    // Collect IDs of tokenized senses that contain one or more of our query IDs
    Dictionary<int, SenseLookupInfo> senseTokenCounts = new Dictionary<int, SenseLookupInfo>();
    bool firstToken = true;
    // For each token...
    foreach (int tokenId in idSet)
    {
        // Get sense instances where it occurs
        List<SenseInfo> instances = index.SenseIndex[tokenId].GetOrLoadInstances(br);
        foreach (SenseInfo si in instances)
        {
            SenseLookupInfo sli;
            // We already have a count for this sense: bump it (single hash lookup)
            // NOTE(review): the final "== idSet.Count" filter assumes a token's instance list
            // holds at most one SenseInfo per tokenized sense; otherwise the count overshoots.
            // Confirm against the index builder.
            if (senseTokenCounts.TryGetValue(si.TokenizedSenseId, out sli))
            {
                ++sli.NumOfQueryTokensInSense;
            }
            // Or this is the first time we're seeing it.
            // We only start counts while processing the first token:
            // we're looking for senses that contain *all* query tokens,
            // so a sense missing the first token can never qualify.
            else if (firstToken)
            {
                sli = new SenseLookupInfo
                {
                    NumOfQueryTokensInSense = 1,
                    TokensInSense = si.TokensInSense
                };
                senseTokenCounts[si.TokenizedSenseId] = sli;
            }
        }
        firstToken = false;
    }

    // Keep those sense IDs (positions) that contain all of our query tokens
    // We already eliminated some candidates through "firstToken" trick before, but not all
    List<int> sensePosList = new List<int>();
    foreach (var x in senseTokenCounts)
    {
        if (x.Value.NumOfQueryTokensInSense == idSet.Count)
            sensePosList.Add(x.Key);
    }

    // Load each tokenized sense to find out:
    // - whether entry is a real match
    // - entry ID
    // - best score for entry (multiple senses may hold query string)
    // - highlights
    Dictionary<int, EntryMatchInfo> entryIdToInfo = new Dictionary<int, EntryMatchInfo>();
    foreach (int senseId in sensePosList)
        doVerifyTarget(txtTokenized, senseId, entryIdToInfo, br);

    // Sort entry IDs by their best score
    // Drop entries with unprintable hanzi in HW now
    List<EntryMatchInfo> entryInfoList = new List<EntryMatchInfo>();
    foreach (var x in entryIdToInfo)
    {
        // Check coverage. Because we don't load full entry, it's possible
        // that some unsupported chars in hybrid text of senses slip through.
        // There's a limit to perfectionism.
        string simp, trad;
        br.Position = x.Value.EntryId;
        CedictEntry.DeserializeHanzi(br, out simp, out trad);
        if (!areHanziCovered(simp, trad)) continue;
        // Queue up for sorting.
        entryInfoList.Add(x.Value);
    }
    // Highest score first
    entryInfoList.Sort((a, b) => b.BestSenseScore.CompareTo(a.BestSenseScore));

    // Wrap surviving entries into results
    foreach (EntryMatchInfo emi in entryInfoList)
    {
        CedictResult cr = new CedictResult(emi.EntryId,
            new ReadOnlyCollection<CedictTargetHighlight>(emi.TargetHilites));
        res.Add(cr);
    }
    return res;
}
/// <summary>
/// Ctor: takes data to display.
/// </summary>
/// <param name="owner">Zen control that owns me.</param>
/// <param name="scale">Scaling factor; stored, and multiplied into the fixed padding values.</param>
/// <param name="tprov">Localized display text provider.</param>
/// <param name="lookupThroughLink">Delegate to call when user initiates lookup by clicking on a link.</param>
/// <param name="parentPaint">Paint delegate, stored for later use. NOTE(review): exact role not visible here - confirm.</param>
/// <param name="getEntry">Delegate to call when an entry must be retrieved (for "copy" context menu).</param>
/// <param name="entryProvider">Dictionary entry provider; queried once here for the result's entry.</param>
/// <param name="cr">The lookup result this control will show.</param>
/// <param name="script">Scripts to show in headword.</param>
/// <param name="last">Stored as-is. NOTE(review): presumably marks the final item of the results list - confirm.</param>
public OneResultControl(ZenControlBase owner, float scale, ITextProvider tprov,
    LookupThroughLinkDelegate lookupThroughLink, ParentPaintDelegate parentPaint,
    GetEntryDelegate getEntry, ICedictEntryProvider entryProvider,
    CedictResult cr, SearchScript script, bool last)
    : base(owner)
{
    // Collaborators and display settings handed in by the caller
    this.scale = scale;
    this.tprov = tprov;
    this.lookupThroughLink = lookupThroughLink;
    this.parentPaint = parentPaint;
    this.getEntry = getEntry;

    // Resolve the entry behind the result right away
    entry = entryProvider.GetEntry(cr.EntryId);
    res = cr;
    analyzedScript = script;
    this.last = last;

    // Paddings: fixed base values multiplied by scale, truncated to int
    padLeft = (int)(5.0F * scale);
    padTop = (int)(4.0F * scale);
    padBottom = (int)(8.0F * scale);
    padMid = (int)(20.0F * scale);
    padRight = (int)(10.0F * scale);
}
/// <summary>
/// Loads each pinyin lookup candidate and keeps those whose headword pinyin
/// really contains the query syllables as a consecutive run.
/// </summary>
/// <param name="br">Reader positioned at each candidate in turn to deserialize the entry.</param>
/// <param name="poss">Positions of candidate entries.</param>
/// <param name="sylls">Query syllables; a tone of -1 matches any tone.</param>
/// <returns>Verified results paired with their loaded entries, in candidate order.</returns>
List<ResWithEntry> doLoadVerifyPinyin(BinReader br, IEnumerable<int> poss, List<PinyinSyllable> sylls)
{
    List<ResWithEntry> keepers = new List<ResWithEntry>();
    // Yes, we only open our file on-demand
    // But we do this within each lookup's scope, so lookup stays thread-safe
    foreach (int entryPos in poss)
    {
        // Deserialize candidate
        br.Position = entryPos;
        CedictEntry candidate = new CedictEntry(br);

        // Slide a window of sylls.Count syllables over the entry's pinyin;
        // stop at the first window where every syllable agrees.
        int matchStart = -1;
        int lastStart = candidate.PinyinCount - sylls.Count;
        for (int start = 0; start <= lastStart && matchStart == -1; ++start)
        {
            bool windowMatches = true;
            for (int offset = 0; offset != sylls.Count; ++offset)
            {
                PinyinSyllable inEntry = candidate.GetPinyinAt(start + offset);
                PinyinSyllable inQuery = sylls[offset];
                // Text is compared against the lower-cased entry syllable;
                // tone is only compared when the query specifies one.
                bool textDiffers = inEntry.Text.ToLowerInvariant() != inQuery.Text;
                if (textDiffers || (inQuery.Tone != -1 && inEntry.Tone != inQuery.Tone))
                {
                    windowMatches = false;
                    break;
                }
            }
            if (windowMatches) matchStart = start;
        }

        // Not found in this entry: drop it
        if (matchStart == -1) continue;
        // Drop if there's any unprintable Hanzi
        if (!areHanziCovered(candidate)) continue;

        // Keeper!
        CedictResult hit = new CedictResult(entryPos, candidate.HanziPinyinMap, matchStart, sylls.Count);
        keepers.Add(new ResWithEntry(hit, candidate));
    }
    return keepers;
}
/// <summary>
/// Ctor: pairs a lookup result with the entry it was produced from.
/// </summary>
/// <param name="res">The lookup result.</param>
/// <param name="entry">The entry belonging to the result.</param>
public ResWithEntry(CedictResult res, CedictEntry entry)
{
    this.Res = res;
    this.Entry = entry;
}
/// <summary>
/// Retrieves hanzi lookup candidates, verifies actual presence of search expression in headword.
/// </summary>
/// <param name="br">Reader positioned at each candidate in turn to deserialize the entry.</param>
/// <param name="poss">Positions of candidate entries.</param>
/// <param name="query">Hanzi to look for in the simplified, then the traditional headword.</param>
/// <param name="script">Currently not consulted by this method (see TO-DO below).</param>
/// <returns>
/// Verified results paired with their loaded entries, in candidate order.
/// The highlight range is the first occurrence of <paramref name="query"/> in the headword that matched.
/// </returns>
List<ResWithEntry> doLoadVerifyHanzi(BinReader br, IEnumerable<int> poss, string query, SearchScript script)
{
    List<ResWithEntry> resList = new List<ResWithEntry>();
    // Yes, we only open our file on-demand
    // But we do this within each lookup's scope, so lookup stays thread-safe
    // Look at each entry: load, verify, keep or drop
    foreach (int pos in poss)
    {
        // Load up entry from file
        br.Position = pos;
        CedictEntry entry = new CedictEntry(br);

        // Figure out position/length of query string in simplified and traditional headwords.
        // Ordinal search: a culture-sensitive IndexOf may skip ignorable characters or apply
        // collation rules, so a "match" could span a different number of chars than
        // query.Length and the highlight range below would be wrong.
        int hiliteLength = 0;
        int hiliteStart = entry.ChSimpl.IndexOf(query, System.StringComparison.Ordinal);
        if (hiliteStart != -1) hiliteLength = query.Length;
        // If not found in simplified, check in traditional
        if (hiliteLength == 0)
        {
            hiliteStart = entry.ChTrad.IndexOf(query, System.StringComparison.Ordinal);
            if (hiliteStart != -1) hiliteLength = query.Length;
        }

        // Entry is a keeper only if the simplified or the traditional headword contains query
        if (hiliteLength == 0) continue;
        // Drop if there's any unprintable hanzi
        if (!areHanziCovered(entry)) continue;

        // TO-DO: indicate wrong script in result
        CedictResult res = new CedictResult(CedictResult.SimpTradWarning.None,
            pos, entry.HanziPinyinMap, hiliteStart, hiliteLength);
        ResWithEntry resWE = new ResWithEntry(res, entry);
        resList.Add(resWE);
    }
    return resList;
}