Ejemplo n.º 1
0
        /// <summary>
        /// Retrieves matching entries for a target-language search expression.
        /// </summary>
        /// <param name="br">Binary reader handed on to the sense index, to sense verification and to headword deserialization.</param>
        /// <param name="query">Search expression; leading and trailing whitespace is trimmed before use.</param>
        /// <returns>
        /// One result per matching entry, ordered by descending best sense score.
        /// Empty if the query is empty, contains an unknown token, or nothing matches.
        /// </returns>
        private List<CedictResult> doTargetLookup(BinReader br, string query)
        {
            // Empty query string: no results
            query = query.Trim();
            if (query == string.Empty) return new List<CedictResult>();

            // Tokenize query string
            HybridText txtQuery = new HybridText(query);
            ReadOnlyCollection<EquivToken> txtTokenized = tokenizer.Tokenize(txtQuery);

            // Collect the distinct token IDs of the query
            List<CedictResult> res = new List<CedictResult>();
            HashSet<int> idSet = new HashSet<int>();
            foreach (EquivToken eqt in txtTokenized)
            {
                // An unknown token (or the special IdZho token) cannot match anything: bail out right away
                if (eqt.TokenId == WordHolder.IdUnknown || eqt.TokenId == index.WordHolder.IdZho)
                    return res;
                idSet.Add(eqt.TokenId);
            }

            // Count, per tokenized sense, how many of the query's token IDs occur in it
            Dictionary<int, SenseLookupInfo> senseTokenCounts = new Dictionary<int, SenseLookupInfo>();
            bool firstToken = true;
            foreach (int tokenId in idSet)
            {
                // Get sense instances where this token occurs
                List<SenseInfo> instances = index.SenseIndex[tokenId].GetOrLoadInstances(br);
                foreach (SenseInfo si in instances)
                {
                    SenseLookupInfo sli;
                    // Sense already has a counter: bump it (one dictionary lookup instead of two)
                    if (senseTokenCounts.TryGetValue(si.TokenizedSenseId, out sli))
                        ++sli.NumOfQueryTokensInSense;
                    // New senses are only recorded while processing the first token:
                    // we want senses containing *all* query tokens, so a sense that
                    // lacks the first one can never qualify
                    else if (firstToken)
                    {
                        sli = new SenseLookupInfo
                        {
                            NumOfQueryTokensInSense = 1,
                            TokensInSense = si.TokensInSense
                        };
                        senseTokenCounts[si.TokenizedSenseId] = sli;
                    }
                }
                firstToken = false;
            }

            // Keep those sense IDs (positions) that contain all of our query tokens
            // The "firstToken" filter above removed some candidates already, but not all
            List<int> sensePosList = new List<int>();
            foreach (var x in senseTokenCounts)
            {
                if (x.Value.NumOfQueryTokensInSense == idSet.Count)
                    sensePosList.Add(x.Key);
            }

            // Load each tokenized sense to find out:
            // - whether entry is a real match
            // - entry ID
            // - best score for entry (multiple senses may hold query string)
            // - highlights
            Dictionary<int, EntryMatchInfo> entryIdToInfo = new Dictionary<int, EntryMatchInfo>();
            foreach (int senseId in sensePosList)
                doVerifyTarget(txtTokenized, senseId, entryIdToInfo, br);

            // Drop entries with unprintable hanzi in the headword, queue the rest for sorting
            List<EntryMatchInfo> entryInfoList = new List<EntryMatchInfo>();
            foreach (var x in entryIdToInfo)
            {
                // Check coverage. Because we don't load full entry, it's possible
                // that some unsupported chars in hybrid text of senses slip through.
                // There's a limit to perfectionism.
                string simp, trad;
                br.Position = x.Value.EntryId;
                CedictEntry.DeserializeHanzi(br, out simp, out trad);
                if (!areHanziCovered(simp, trad)) continue;
                entryInfoList.Add(x.Value);
            }

            // Sort by best score, highest first
            entryInfoList.Sort((a, b) => b.BestSenseScore.CompareTo(a.BestSenseScore));

            // Wrap entry IDs and their highlights into results
            foreach (EntryMatchInfo emi in entryInfoList)
            {
                CedictResult cr = new CedictResult(emi.EntryId,
                    new ReadOnlyCollection<CedictTargetHighlight>(emi.TargetHilites));
                res.Add(cr);
            }
            return res;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Ctor: stores the data and delegates needed for display, fetches the entry
        /// belonging to the lookup result, and derives the scaled paddings.
        /// </summary>
        /// <param name="owner">Zen control that owns me.</param>
        /// <param name="scale">Factor by which the base padding values are multiplied.</param>
        /// <param name="tprov">Localized display text provider.</param>
        /// <param name="lookupThroughLink">Delegate to call when user initiates lookup by clicking on a link.</param>
        /// <param name="parentPaint">Delegate stored for later use; presumably asks the parent to repaint (confirm against usage).</param>
        /// <param name="getEntry">Delegate to call when an entry must be retrieved (for "copy" context menu).</param>
        /// <param name="entryProvider">Dictionary entry provider; queried once here for the result's entry.</param>
        /// <param name="cr">The lookup result this control will show.</param>
        /// <param name="script">Scripts to show in headword.</param>
        /// <param name="last">Stored as-is; presumably marks the last item in the results list (confirm against usage).</param>
        public OneResultControl(
            ZenControlBase owner,
            float scale,
            ITextProvider tprov,
            LookupThroughLinkDelegate lookupThroughLink,
            ParentPaintDelegate parentPaint,
            GetEntryDelegate getEntry,
            ICedictEntryProvider entryProvider,
            CedictResult cr,
            SearchScript script,
            bool last)
            : base(owner)
        {
            // Paddings: base value times scale, fraction discarded by the int cast
            padLeft = (int)(scale * 5.0F);
            padTop = (int)(scale * 4.0F);
            padBottom = (int)(scale * 8.0F);
            padMid = (int)(scale * 20.0F);
            padRight = (int)(scale * 10.0F);

            // Display settings
            this.scale = scale;
            this.last = last;
            analyzedScript = script;

            // Collaborators
            this.tprov = tprov;
            this.lookupThroughLink = lookupThroughLink;
            this.parentPaint = parentPaint;
            this.getEntry = getEntry;

            // The result to show, and the entry it points to
            res = cr;
            entry = entryProvider.GetEntry(cr.EntryId);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Loads each pinyin lookup candidate and keeps those whose pinyin really contains
        /// the query syllables as a contiguous run.
        /// </summary>
        /// <param name="br">Reader positioned at each candidate in turn to load its entry.</param>
        /// <param name="poss">Reader positions of the candidate entries.</param>
        /// <param name="sylls">Query syllables; a tone of -1 matches any tone.</param>
        /// <returns>Verified results, each paired with its loaded entry, in the order of <paramref name="poss"/>.</returns>
        List<ResWithEntry> doLoadVerifyPinyin(BinReader br, IEnumerable<int> poss, List<PinyinSyllable> sylls)
        {
            List<ResWithEntry> keepers = new List<ResWithEntry>();
            int queryLen = sylls.Count;

            foreach (int entryPos in poss)
            {
                // Load candidate
                br.Position = entryPos;
                CedictEntry entry = new CedictEntry(br);

                // Slide the query over the entry's syllables; stop at the first full match
                int matchStart = -1;
                int lastStart = entry.PinyinCount - queryLen;
                for (int start = 0; start <= lastStart && matchStart == -1; ++start)
                {
                    bool allMatch = true;
                    for (int offs = 0; offs < queryLen; ++offs)
                    {
                        PinyinSyllable inEntry = entry.GetPinyinAt(start + offs);
                        PinyinSyllable inQuery = sylls[offs];
                        // Text is compared against the entry's lower-cased text;
                        // tone only counts when the query specifies one
                        if (inEntry.Text.ToLowerInvariant() != inQuery.Text
                            || (inQuery.Tone != -1 && inEntry.Tone != inQuery.Tone))
                        {
                            allMatch = false;
                            break;
                        }
                    }
                    if (allMatch) matchStart = start;
                }

                // Query syllables not in entry: drop
                if (matchStart == -1) continue;

                // Unprintable hanzi: drop
                if (!areHanziCovered(entry)) continue;

                // Verified: wrap up with highlight range
                CedictResult verified = new CedictResult(entryPos, entry.HanziPinyinMap, matchStart, queryLen);
                keepers.Add(new ResWithEntry(verified, entry));
            }
            return keepers;
        }
Ejemplo n.º 4
0
 /// <summary>
 /// Ctor: bundles a lookup result together with an entry.
 /// </summary>
 /// <param name="res">Value stored in <c>Res</c>.</param>
 /// <param name="entry">Value stored in <c>Entry</c>.</param>
 public ResWithEntry(CedictResult res, CedictEntry entry)
 {
     this.Entry = entry;
     this.Res = res;
 }
Ejemplo n.º 5
0
        /// <summary>
        /// Retrieves hanzi lookup candidates, verifies actual presence of search expression in headword.
        /// </summary>
        /// <param name="br">Reader positioned at each candidate in turn to load its entry.</param>
        /// <param name="poss">Reader positions of the candidate entries.</param>
        /// <param name="query">Hanzi string that must occur in the simplified or the traditional headword.</param>
        /// <param name="script">Currently not consulted; every result is created with <c>SimpTradWarning.None</c>.</param>
        /// <returns>Verified results, each paired with its loaded entry, in the order of <paramref name="poss"/>.</returns>
        List<ResWithEntry> doLoadVerifyHanzi(BinReader br, IEnumerable<int> poss, string query, SearchScript script)
        {
            List<ResWithEntry> resList = new List<ResWithEntry>();
            // Look at each entry: load, verify, keep or drop
            foreach (int pos in poss)
            {
                // Load up entry from file
                br.Position = pos;
                CedictEntry entry = new CedictEntry(br);

                // Figure out position/length of query string in simplified and traditional headwords.
                // Ordinal search: the highlight range is expressed in chars, so the match must be
                // an exact char-by-char one. The culture-sensitive default can report hits for
                // ignorable characters or for equivalent sequences of a different length.
                int hiliteLength = 0;
                int hiliteStart = entry.ChSimpl.IndexOf(query, System.StringComparison.Ordinal);
                if (hiliteStart != -1) hiliteLength = query.Length;
                // If not found in simplified, check in traditional
                if (hiliteLength == 0)
                {
                    hiliteStart = entry.ChTrad.IndexOf(query, System.StringComparison.Ordinal);
                    if (hiliteStart != -1) hiliteLength = query.Length;
                }

                // Entry is a keeper only if simplified or traditional headword contains query
                if (hiliteLength == 0) continue;

                // Drop if there's any unprintable hanzi
                if (!areHanziCovered(entry)) continue;

                // TO-DO: indicate wrong script in result
                CedictResult res = new CedictResult(CedictResult.SimpTradWarning.None,
                    pos, entry.HanziPinyinMap,
                    hiliteStart, hiliteLength);
                ResWithEntry resWE = new ResWithEntry(res, entry);
                resList.Add(resWE);
            }
            return resList;
        }