/// <summary>
/// Retrieves matching entries for a target-language search expression.
/// </summary>
/// <param name="br">
/// Reader handed on to the sense index, to sense verification and to hanzi
/// deserialization. Its <c>Position</c> is changed by this call.
/// </param>
/// <param name="query">Search expression; leading and trailing whitespace is ignored.</param>
/// <returns>
/// One result per matching entry, ordered by descending best sense score.
/// Empty if the query is blank, contains an unknown or Chinese token, or nothing matches.
/// </returns>
private List<CedictResult> doTargetLookup(BinReader br, string query)
{
    // Empty query string: no results
    query = query.Trim();
    if (query == string.Empty)
    {
        return new List<CedictResult>();
    }

    // Tokenize query string
    HybridText txtQuery = new HybridText(query);
    ReadOnlyCollection<EquivToken> txtTokenized = tokenizer.Tokenize(txtQuery);

    // Collect the distinct token IDs of the query.
    // A single unknown (or Chinese) token means nothing can match, so bail out right away.
    HashSet<int> idSet = new HashSet<int>();
    foreach (EquivToken eqt in txtTokenized)
    {
        if (eqt.TokenId == WordHolder.IdUnknown || eqt.TokenId == index.WordHolder.IdZho)
        {
            return new List<CedictResult>();
        }
        idSet.Add(eqt.TokenId);
    }

    // Count, per tokenized sense, how many of the distinct query tokens it contains.
    // Keyed by tokenized sense ID (position).
    Dictionary<int, SenseLookupInfo> senseTokenCounts = new Dictionary<int, SenseLookupInfo>();
    bool firstToken = true;
    foreach (int tokenId in idSet)
    {
        // Get sense instances where this token occurs
        List<SenseInfo> instances = index.SenseIndex[tokenId].GetOrLoadInstances(br);
        foreach (SenseInfo si in instances)
        {
            SenseLookupInfo sli;
            if (senseTokenCounts.TryGetValue(si.TokenizedSenseId, out sli))
            {
                // Sense is already a candidate: one more query token found in it.
                ++sli.NumOfQueryTokensInSense;
            }
            else if (firstToken)
            {
                // New candidates are only recorded while processing the first token:
                // we want senses that contain *all* query tokens, so a sense that
                // lacks the first one can never qualify.
                sli = new SenseLookupInfo
                {
                    NumOfQueryTokensInSense = 1,
                    TokensInSense = si.TokensInSense
                };
                senseTokenCounts[si.TokenizedSenseId] = sli;
            }
        }
        firstToken = false;
    }

    // Verify every sense that contains all of our query tokens.
    // The "firstToken" trick above eliminated some candidates already, but not all.
    // Loading each tokenized sense tells us:
    // - whether entry is a real match
    // - entry ID
    // - best score for entry (multiple senses may hold query string)
    // - highlights
    Dictionary<int, EntryMatchInfo> entryIdToInfo = new Dictionary<int, EntryMatchInfo>();
    foreach (var candidate in senseTokenCounts)
    {
        if (candidate.Value.NumOfQueryTokensInSense == idSet.Count)
        {
            doVerifyTarget(txtTokenized, candidate.Key, entryIdToInfo, br);
        }
    }

    // Drop entries with unprintable hanzi in HW, then sort the rest by best score
    List<EntryMatchInfo> entryInfoList = new List<EntryMatchInfo>();
    foreach (var x in entryIdToInfo)
    {
        // Check coverage. Because we don't load full entry, it's possible
        // that some unsupported chars in hybrid text of senses slip through.
        // There's a limit to perfectionism.
        string simp, trad;
        br.Position = x.Value.EntryId;
        CedictEntry.DeserializeHanzi(br, out simp, out trad);
        if (!areHanziCovered(simp, trad))
        {
            continue;
        }
        // Queue up for sorting.
        entryInfoList.Add(x.Value);
    }
    // Highest score first
    entryInfoList.Sort((a, b) => b.BestSenseScore.CompareTo(a.BestSenseScore));

    // Wrap entry IDs and their highlights into results
    List<CedictResult> res = new List<CedictResult>();
    foreach (EntryMatchInfo emi in entryInfoList)
    {
        CedictResult cr = new CedictResult(emi.EntryId,
            new ReadOnlyCollection<CedictTargetHighlight>(emi.TargetHilites));
        res.Add(cr);
    }
    return res;
}
/// <summary>
/// Retrieves matching entries for a target-language search expression.
/// </summary>
/// <param name="br">
/// Reader handed on to the sense index, to sense verification and to hanzi
/// deserialization. Its <c>Position</c> is changed by this call.
/// </param>
/// <param name="query">Search expression; leading and trailing whitespace is ignored.</param>
/// <returns>
/// One result per matching entry, ordered by descending best sense score.
/// Empty if the query is blank, contains an unknown or Chinese token, or nothing matches.
/// </returns>
private List<CedictResult> doTargetLookup(BinReader br, string query)
{
    // Empty query string: no results
    query = query.Trim();
    if (query == string.Empty)
    {
        return new List<CedictResult>();
    }

    // Tokenize query string
    HybridText txtQuery = new HybridText(query);
    ReadOnlyCollection<EquivToken> txtTokenized = tokenizer.Tokenize(txtQuery);

    // Gather the distinct token IDs of the query.
    // Any unknown (or Chinese) token - no match, we know that immediately.
    HashSet<int> idSet = new HashSet<int>();
    foreach (EquivToken eqt in txtTokenized)
    {
        if (eqt.TokenId == WordHolder.IdUnknown || eqt.TokenId == index.WordHolder.IdZho)
        {
            return new List<CedictResult>();
        }
        idSet.Add(eqt.TokenId);
    }

    // Per tokenized sense ID: how many of the distinct query tokens the sense contains.
    Dictionary<int, SenseLookupInfo> senseTokenCounts = new Dictionary<int, SenseLookupInfo>();
    bool firstToken = true;
    foreach (int tokenId in idSet)
    {
        // Get sense instances where this token occurs
        List<SenseInfo> instances = index.SenseIndex[tokenId].GetOrLoadInstances(br);
        foreach (SenseInfo si in instances)
        {
            SenseLookupInfo sli;
            if (senseTokenCounts.TryGetValue(si.TokenizedSenseId, out sli))
            {
                // We already track this sense: it holds one more of the query tokens.
                ++sli.NumOfQueryTokensInSense;
            }
            else if (firstToken)
            {
                // Senses are only added while handling the first token. We're looking
                // for senses that contain *all* query tokens, so one that is first
                // seen with a later token cannot be a match.
                sli = new SenseLookupInfo
                {
                    NumOfQueryTokensInSense = 1,
                    TokensInSense = si.TokensInSense
                };
                senseTokenCounts[si.TokenizedSenseId] = sli;
            }
        }
        firstToken = false;
    }

    // Keep those sense IDs (positions) that contain all of our query tokens, and
    // load each such tokenized sense to find out:
    // - whether entry is a real match
    // - entry ID
    // - best score for entry (multiple senses may hold query string)
    // - highlights
    Dictionary<int, EntryMatchInfo> entryIdToInfo = new Dictionary<int, EntryMatchInfo>();
    foreach (var candidate in senseTokenCounts)
    {
        if (candidate.Value.NumOfQueryTokensInSense == idSet.Count)
        {
            doVerifyTarget(txtTokenized, candidate.Key, entryIdToInfo, br);
        }
    }

    // Drop entries with unprintable hanzi in HW now; the rest gets sorted by best score
    List<EntryMatchInfo> entryInfoList = new List<EntryMatchInfo>();
    foreach (var x in entryIdToInfo)
    {
        // Check coverage. Because we don't load full entry, it's possible
        // that some unsupported chars in hybrid text of senses slip through.
        // There's a limit to perfectionism.
        string simp, trad;
        br.Position = x.Value.EntryId;
        CedictEntry.DeserializeHanzi(br, out simp, out trad);
        if (!areHanziCovered(simp, trad))
        {
            continue;
        }
        // Queue up for sorting.
        entryInfoList.Add(x.Value);
    }
    // Descending by best sense score
    entryInfoList.Sort((a, b) => b.BestSenseScore.CompareTo(a.BestSenseScore));

    // Wrap entry IDs and their highlights into results
    List<CedictResult> res = new List<CedictResult>();
    foreach (EntryMatchInfo emi in entryInfoList)
    {
        CedictResult cr = new CedictResult(emi.EntryId,
            new ReadOnlyCollection<CedictTargetHighlight>(emi.TargetHilites));
        res.Add(cr);
    }
    return res;
}