Example #1
0
        /// <summary>
        /// Retrieves matching entries for a target-language search expression.
        /// </summary>
        /// <param name="br">Reader used to load sense instances, verify candidate senses and read
        /// headword hanzi. Its position may be changed by this method.</param>
        /// <param name="query">Search expression; leading and trailing whitespace is ignored.</param>
        /// <returns>Matching entries ordered by descending best sense score. The list is empty (never
        /// null) if the query is blank, contains a token that rules out a match, or nothing matches.</returns>
        private List<CedictResult> doTargetLookup(BinReader br, string query)
        {
            // Empty query string: no results
            query = query.Trim();
            if (query.Length == 0) return new List<CedictResult>();

            // Tokenize query string and collect its distinct token IDs
            HybridText txtQuery = new HybridText(query);
            ReadOnlyCollection<EquivToken> txtTokenized = tokenizer.Tokenize(txtQuery);
            HashSet<int> idSet = new HashSet<int>();
            foreach (EquivToken eqt in txtTokenized)
            {
                // Any unknown token (or one carrying the IdZho ID) means no match; we know that immediately
                if (eqt.TokenId == WordHolder.IdUnknown || eqt.TokenId == index.WordHolder.IdZho)
                    return new List<CedictResult>();
                idSet.Add(eqt.TokenId);
            }

            // Count, per tokenized sense, how many of our distinct query tokens occur in it.
            // We're looking for senses that contain *all* query tokens, so a sense that does not
            // show up for the first token can never qualify: only the first token creates records.
            Dictionary<int, SenseLookupInfo> senseTokenCounts = new Dictionary<int, SenseLookupInfo>();
            bool firstToken = true;
            foreach (int tokenId in idSet)
            {
                // Get sense instances where this token occurs
                List<SenseInfo> instances = index.SenseIndex[tokenId].GetOrLoadInstances(br);
                foreach (SenseInfo si in instances)
                {
                    SenseLookupInfo sli;
                    // Single hash lookup: fetch the existing record, if any, and bump it
                    if (senseTokenCounts.TryGetValue(si.TokenizedSenseId, out sli))
                    {
                        ++sli.NumOfQueryTokensInSense;
                    }
                    // First time we're seeing this sense; only recorded while processing the first token
                    else if (firstToken)
                    {
                        senseTokenCounts[si.TokenizedSenseId] = new SenseLookupInfo
                        {
                            NumOfQueryTokensInSense = 1,
                            TokensInSense = si.TokensInSense
                        };
                    }
                }
                firstToken = false;
            }

            // Keep those sense IDs (positions) that contain all of our query tokens
            // The "firstToken" trick already eliminated some candidates, but not all
            List<int> sensePosList = new List<int>();
            foreach (KeyValuePair<int, SenseLookupInfo> x in senseTokenCounts)
            {
                if (x.Value.NumOfQueryTokensInSense == idSet.Count)
                    sensePosList.Add(x.Key);
            }

            // Load each tokenized sense to find out:
            // - whether entry is a real match
            // - entry ID
            // - best score for entry (multiple senses may hold query string)
            // - highlights
            Dictionary<int, EntryMatchInfo> entryIdToInfo = new Dictionary<int, EntryMatchInfo>();
            foreach (int senseId in sensePosList)
                doVerifyTarget(txtTokenized, senseId, entryIdToInfo, br);

            // Drop entries with unprintable hanzi in HW now, then sort the rest by best score
            List<EntryMatchInfo> entryInfoList = new List<EntryMatchInfo>(entryIdToInfo.Count);
            foreach (KeyValuePair<int, EntryMatchInfo> x in entryIdToInfo)
            {
                // Check coverage. Because we don't load full entry, it's possible
                // that some unsupported chars in hybrid text of senses slip through.
                // There's a limit to perfectionism.
                string simp, trad;
                // The entry ID is used directly as the reader position of the entry
                br.Position = x.Value.EntryId;
                CedictEntry.DeserializeHanzi(br, out simp, out trad);
                if (!areHanziCovered(simp, trad)) continue;
                // Queue up for sorting.
                entryInfoList.Add(x.Value);
            }
            // Descending: highest best-sense score first
            entryInfoList.Sort((a, b) => b.BestSenseScore.CompareTo(a.BestSenseScore));

            // Wrap entries into results, preserving the sorted order
            List<CedictResult> res = new List<CedictResult>(entryInfoList.Count);
            foreach (EntryMatchInfo emi in entryInfoList)
            {
                CedictResult cr = new CedictResult(emi.EntryId,
                    new ReadOnlyCollection<CedictTargetHighlight>(emi.TargetHilites));
                res.Add(cr);
            }
            return res;
        }
Example #2
0
        /// <summary>
        /// Retrieves matching entries for a target-language search expression.
        /// </summary>
        /// <param name="br">Reader used to load sense instances, verify candidate senses and read
        /// headword hanzi. Its position may be changed by this method.</param>
        /// <param name="query">Search expression; leading and trailing whitespace is ignored.</param>
        /// <returns>Matching entries ordered by descending best sense score. The list is empty (never
        /// null) if the query is blank, contains a token that rules out a match, or nothing matches.</returns>
        private List <CedictResult> doTargetLookup(BinReader br, string query)
        {
            // Empty query string: no results
            query = query.Trim();
            if (query.Length == 0)
            {
                return(new List <CedictResult>());
            }

            // Tokenize query string
            HybridText txtQuery = new HybridText(query);
            ReadOnlyCollection <EquivToken> txtTokenized = tokenizer.Tokenize(txtQuery);
            // Get query string's distinct token IDs
            HashSet <int> idSet = new HashSet <int>();

            foreach (EquivToken eqt in txtTokenized)
            {
                // Any unknown token (or one carrying the IdZho ID) - no match, we know that immediately
                if (eqt.TokenId == WordHolder.IdUnknown || eqt.TokenId == index.WordHolder.IdZho)
                {
                    return(new List <CedictResult>());
                }
                idSet.Add(eqt.TokenId);
            }

            // Collect IDs of tokenized senses that contain one or more of our query IDs,
            // counting how many distinct query tokens each of them holds
            Dictionary <int, SenseLookupInfo> senseTokenCounts = new Dictionary <int, SenseLookupInfo>();
            bool firstToken = true;

            // For each token...
            foreach (int tokenId in idSet)
            {
                // Get sense instances where it occurs
                List <SenseInfo> instances = index.SenseIndex[tokenId].GetOrLoadInstances(br);
                foreach (SenseInfo si in instances)
                {
                    SenseLookupInfo sli;
                    // We already have a count for this sense: one lookup fetches it, then bump it
                    if (senseTokenCounts.TryGetValue(si.TokenizedSenseId, out sli))
                    {
                        ++sli.NumOfQueryTokensInSense;
                    }
                    // Or this is the first time we're seeing it
                    // We only create records for the first token:
                    // we're looking for senses that contain *all* query tokens
                    else if (firstToken)
                    {
                        senseTokenCounts[si.TokenizedSenseId] = new SenseLookupInfo
                        {
                            NumOfQueryTokensInSense = 1,
                            TokensInSense           = si.TokensInSense
                        };
                    }
                }
                firstToken = false;
            }

            // Keep those sense IDs (positions) that contain all of our query tokens
            // We already eliminated some candidates through "firstToken" trick before, but not all
            List <int> sensePosList = new List <int>();

            foreach (KeyValuePair <int, SenseLookupInfo> x in senseTokenCounts)
            {
                if (x.Value.NumOfQueryTokensInSense == idSet.Count)
                {
                    sensePosList.Add(x.Key);
                }
            }

            // Load each tokenized sense to find out:
            // - whether entry is a real match
            // - entry ID
            // - best score for entry (multiple senses may hold query string)
            // - highlights
            Dictionary <int, EntryMatchInfo> entryIdToInfo = new Dictionary <int, EntryMatchInfo>();

            foreach (int senseId in sensePosList)
            {
                doVerifyTarget(txtTokenized, senseId, entryIdToInfo, br);
            }

            // Sort entry IDs by their best score
            // Drop entries with unprintable hanzi in HW now
            List <EntryMatchInfo> entryInfoList = new List <EntryMatchInfo>(entryIdToInfo.Count);

            foreach (KeyValuePair <int, EntryMatchInfo> x in entryIdToInfo)
            {
                // Check coverage. Because we don't load full entry, it's possible
                // that some unsupported chars in hybrid text of senses slip through.
                // There's a limit to perfectionism.
                string simp, trad;
                // The entry ID is used directly as the reader position of the entry
                br.Position = x.Value.EntryId;
                CedictEntry.DeserializeHanzi(br, out simp, out trad);
                if (!areHanziCovered(simp, trad))
                {
                    continue;
                }
                // Queue up for sorting.
                entryInfoList.Add(x.Value);
            }
            // Descending: highest best-sense score first
            entryInfoList.Sort((a, b) => b.BestSenseScore.CompareTo(a.BestSenseScore));

            // Wrap entries into results, preserving the sorted order
            List <CedictResult> res = new List <CedictResult>(entryInfoList.Count);

            foreach (EntryMatchInfo emi in entryInfoList)
            {
                CedictResult cr = new CedictResult(emi.EntryId,
                                                   new ReadOnlyCollection <CedictTargetHighlight>(emi.TargetHilites));
                res.Add(cr);
            }
            return(res);
        }