Ejemplo n.º 1
0
            /// <summary>
            /// Orders two hanzi lookup results for sorted presentation.
            /// </summary>
            /// <param name="a">Left-hand result with its entry.</param>
            /// <param name="b">Right-hand result with its entry.</param>
            /// <returns>
            /// The first non-zero outcome of: highlight start comparison, the entry's
            /// PinyinCompare, simplified headword length comparison.
            /// </returns>
            private static int hrComp(ResWithEntry a, ResWithEntry b)
            {
                // Primary key: the earlier the hanzi match starts, the earlier the result sorts
                int byStart = a.Res.HanziHiliteStart.CompareTo(b.Res.HanziHiliteStart);
                if (byStart == 0)
                {
                    // Secondary key: whatever ordering the entry's own PinyinCompare defines
                    int byPinyin = a.Entry.PinyinCompare(b.Entry);
                    if (byPinyin == 0)
                    {
                        // Last key: the shorter simplified headword comes first
                        return a.Entry.ChSimpl.Length.CompareTo(b.Entry.ChSimpl.Length);
                    }
                    return byPinyin;
                }
                return byStart;
            }
Ejemplo n.º 2
0
            /// <summary>
            /// Runs a pinyin lookup and appends the sorted matches to <paramref name="res"/>.
            /// </summary>
            /// <param name="query">Query string; parsed by interpretPinyin.</param>
            /// <param name="ep">Receives every matching entry, keyed by the result's EntryId.</param>
            /// <param name="res">List that the results are appended to, in pyComp order.</param>
            private void lookupPinyin(string query,
                                      EntryProvider ep, List<CedictResult> res)
            {
                // Interpret query string: qnorm drives candidate retrieval, qsylls drives verification
                List<PinyinSyllable> qsylls, qnorm;
                interpretPinyin(query, out qsylls, out qnorm);

                // Get instance vectors, one candidate set per syllable
                Dictionary<string, HashSet<int>> candsBySyll = getPinyinCandidates(qnorm);
                // Intersect candidates
                List<int> cands = intersectCandidates(candsBySyll);
                // Retrieve all candidates; verify on the fly
                List<ResWithEntry> rl = retrieveVerifyPinyin(cands, qsylls);

                // Sort pinyin results
                rl.Sort((a, b) => pyComp(a, b));

                // Reserve room for the new items without ever shrinking the list:
                // assigning a Capacity below the current Count throws, which the
                // unconditional "Capacity = rl.Count" did whenever res was already
                // holding more items than this lookup produced.
                int needed = res.Count + rl.Count;
                if (res.Capacity < needed)
                {
                    res.Capacity = needed;
                }

                // Done: hand over results and their entries
                foreach (ResWithEntry rwe in rl)
                {
                    res.Add(rwe.Res);
                    ep.AddEntry(rwe.Res.EntryId, rwe.Entry);
                }
            }
Ejemplo n.º 3
0
        /// <summary>
        /// Loads each pinyin lookup candidate and keeps those whose pinyin really contains
        /// the query syllables as one contiguous run.
        /// </summary>
        /// <param name="br">Reader positioned on each candidate in turn to load its entry.</param>
        /// <param name="poss">Positions of the candidate entries.</param>
        /// <param name="sylls">Query syllables; a tone of -1 matches any tone.</param>
        /// <returns>One item per verified entry whose hanzi pass areHanziCovered.</returns>
        List<ResWithEntry> doLoadVerifyPinyin(BinReader br, IEnumerable<int> poss, List<PinyinSyllable> sylls)
        {
            List<ResWithEntry> kept = new List<ResWithEntry>();

            // Note carried over from the original: the file is only opened on demand, but
            // within each lookup's scope, so lookup stays thread-safe.
            foreach (int pos in poss)
            {
                // Load up entry from file
                br.Position = pos;
                CedictEntry entry = new CedictEntry(br);

                // Slide the query over the entry's syllables; stop at the first full match
                int matchStart = -1;
                for (int start = 0; matchStart == -1 && start <= entry.PinyinCount - sylls.Count; ++start)
                {
                    bool allMatch = true;
                    for (int k = 0; allMatch && k != sylls.Count; ++k)
                    {
                        PinyinSyllable inEntry = entry.GetPinyinAt(start + k);
                        PinyinSyllable inQuery = sylls[k];
                        // Text is compared against the lower-cased entry syllable; the tone is
                        // only looked at when the text agrees and the query specifies one
                        allMatch = !(inEntry.Text.ToLowerInvariant() != inQuery.Text)
                                   && (inQuery.Tone == -1 || inEntry.Tone == inQuery.Tone);
                    }
                    if (allMatch)
                    {
                        matchStart = start;
                    }
                }

                // Keep only if the syllables were found and there is no unprintable Hanzi
                if (matchStart != -1 && areHanziCovered(entry))
                {
                    CedictResult hit = new CedictResult(pos, entry.HanziPinyinMap, matchStart, sylls.Count);
                    kept.Add(new ResWithEntry(hit, entry));
                }
            }
            return kept;
        }
Ejemplo n.º 4
0
            /// <summary>
            /// Runs a hanzi lookup and appends the sorted matches to <paramref name="res"/>.
            /// </summary>
            /// <param name="query">Query string; upper-cased, trimmed and stripped of spaces before use.</param>
            /// <param name="ep">Receives every matching entry, keyed by the result's EntryId.</param>
            /// <param name="res">List that the results are appended to, in hrComp order.</param>
            /// <param name="anns">Not touched by this method.</param>
            private void lookupHanzi(string query, EntryProvider ep,
                                     List<CedictResult> res, List<CedictAnnotation> anns)
            {
                // Normalize the query, then collect its distinct characters
                query = query.ToUpperInvariant().Trim().Replace(" ", "");
                HashSet<char> qhanzi = new HashSet<char>(query);

                // Get instance vectors
                Dictionary<char, HashSet<int>> candsBySimp = new Dictionary<char, HashSet<int>>();
                Dictionary<char, HashSet<int>> candsByTrad = new Dictionary<char, HashSet<int>>();
                if (!getHanziCandidates(qhanzi, candsBySimp, candsByTrad))
                {
                    // If at least one Hanzi doesn't occur in any HW: we're done.
                    return;
                }

                // Intersect candidates per script, then take the union of both
                HashSet<int> candsSimp = intersectCandidates(candsBySimp);
                HashSet<int> candsTrad = intersectCandidates(candsByTrad);
                HashSet<int> cands = new HashSet<int>();
                cands.UnionWith(candsSimp);
                cands.UnionWith(candsTrad);

                // Retrieve all candidates; verify on the fly
                List<ResWithEntry> rl = retrieveVerifyHanzi(cands, query);

                // Sort Hanzi results
                rl.Sort((a, b) => hrComp(a, b));

                // Reserve room for the new items without ever shrinking the list:
                // assigning a Capacity below the current Count throws, which the
                // unconditional "Capacity = rl.Count" did whenever res was already
                // holding more items than this lookup produced.
                int needed = res.Count + rl.Count;
                if (res.Capacity < needed)
                {
                    res.Capacity = needed;
                }

                // Done: hand over results and their entries
                foreach (ResWithEntry rwe in rl)
                {
                    res.Add(rwe.Res);
                    ep.AddEntry(rwe.Res.EntryId, rwe.Entry);
                }
            }
Ejemplo n.º 5
0
        /// <summary>
        /// Retrieves hanzi lookup candidates, verifies actual presence of search expression in headword.
        /// </summary>
        /// <param name="br">Reader positioned on each candidate in turn to load its entry.</param>
        /// <param name="poss">Positions of the candidate entries.</param>
        /// <param name="query">Character sequence to find in the simplified or traditional headword.</param>
        /// <param name="script">Not used by this method body.</param>
        /// <returns>One item per entry that contains the query and passes areHanziCovered.</returns>
        List<ResWithEntry> doLoadVerifyHanzi(BinReader br, IEnumerable<int> poss, string query, SearchScript script)
        {
            List<ResWithEntry> resList = new List<ResWithEntry>();

            // Yes, we only open our file on-demand
            // But we do this within each lookup's scope, so lookup stays thread-safe
            // Look at each entry: load, verify, keep or drop
            foreach (int pos in poss)
            {
                // Load up entry from file
                br.Position = pos;
                CedictEntry entry = new CedictEntry(br);

                // Locate the query in the simplified headword, falling back to traditional.
                // The search is ordinal: the one-argument IndexOf is culture-sensitive, so the
                // position it reports need not line up with a span of exactly query.Length
                // characters, which is what the highlight below assumes.
                int hiliteStart = entry.ChSimpl.IndexOf(query, System.StringComparison.Ordinal);
                if (hiliteStart == -1)
                {
                    hiliteStart = entry.ChTrad.IndexOf(query, System.StringComparison.Ordinal);
                }

                // Not a keeper if neither headword contains the query. An empty query
                // "matches" at 0 but has nothing to highlight, so it is dropped as well.
                if (hiliteStart == -1 || query.Length == 0)
                {
                    continue;
                }

                // Drop if there's any unprintable hanzi
                if (!areHanziCovered(entry))
                {
                    continue;
                }

                // TO-DO: indicate wrong script in result
                CedictResult res = new CedictResult(CedictResult.SimpTradWarning.None,
                                                    pos, entry.HanziPinyinMap,
                                                    hiliteStart, query.Length);
                resList.Add(new ResWithEntry(res, entry));
            }
            return resList;
        }
Ejemplo n.º 6
0
            /// <summary>
            /// Loads each candidate entry and keeps those whose pinyin contains the query
            /// syllables as one contiguous run.
            /// </summary>
            /// <param name="cands">Blob IDs of the candidate entries.</param>
            /// <param name="qsylls">Query syllables; a tone of -1 matches any tone.</param>
            /// <returns>One item per verified entry, in candidate order.</returns>
            private List<ResWithEntry> retrieveVerifyPinyin(List<int> cands, List<PinyinSyllable> qsylls)
            {
                List<ResWithEntry> verified = new List<ResWithEntry>();

                foreach (int blobId in cands)
                {
                    // Load entry from DB
                    CedictEntry entry = loadFromBlob(blobId);

                    // Try every offset at which the whole query still fits into the entry
                    int syllStart = -1;
                    int offset = 0;
                    while (syllStart == -1 && offset <= entry.PinyinCount - qsylls.Count)
                    {
                        // Count how many consecutive query syllables agree at this offset
                        int matched = 0;
                        while (matched != qsylls.Count)
                        {
                            PinyinSyllable syllEntry = entry.GetPinyinAt(offset + matched);
                            PinyinSyllable syllQuery = qsylls[matched];
                            bool mismatch = syllEntry.Text.ToLowerInvariant() != syllQuery.Text
                                            || (syllQuery.Tone != -1 && syllEntry.Tone != syllQuery.Tone);
                            if (mismatch)
                            {
                                break;
                            }
                            ++matched;
                        }

                        if (matched == qsylls.Count)
                        {
                            syllStart = offset;
                        }
                        else
                        {
                            ++offset;
                        }
                    }

                    // Entry is a keeper only if the query syllables were found
                    if (syllStart != -1)
                    {
                        CedictResult cres = new CedictResult(blobId, entry.HanziPinyinMap, syllStart, qsylls.Count);
                        verified.Add(new ResWithEntry(cres, entry));
                    }
                }
                return verified;
            }
Ejemplo n.º 7
0
            /// <summary>
            /// Orders two pinyin lookup results for sorted presentation.
            /// </summary>
            /// <returns>
            /// The first non-zero outcome of: PinyinCount comparison, pinyin highlight start
            /// comparison, the entry's PinyinCompare.
            /// </returns>
            private static int pyComp(ResWithEntry a, ResWithEntry b)
            {
                // Entry with the smaller pinyin count comes first
                int result = a.Entry.PinyinCount.CompareTo(b.Entry.PinyinCount);

                // Tie: the match that starts sooner comes first
                if (result == 0)
                {
                    result = a.Res.PinyinHiliteStart.CompareTo(b.Res.PinyinHiliteStart);
                }

                // Still tied: fall back to the entry's own pinyin ordering
                if (result == 0)
                {
                    result = a.Entry.PinyinCompare(b.Entry);
                }
                return result;
            }
Ejemplo n.º 8
0
            /// <summary>
            /// Loads each candidate entry and keeps those whose simplified or traditional
            /// headword actually contains the query.
            /// </summary>
            /// <param name="cands">Blob IDs of the candidate entries.</param>
            /// <param name="query">Character sequence to find in the headword.</param>
            /// <returns>One item per entry that contains the query.</returns>
            private List<ResWithEntry> retrieveVerifyHanzi(HashSet<int> cands, string query)
            {
                List<ResWithEntry> resList = new List<ResWithEntry>();

                foreach (int blobId in cands)
                {
                    // Load entry from DB
                    CedictEntry entry = loadFromBlob(blobId);

                    // Locate the query in the simplified headword, falling back to traditional.
                    // The search is ordinal: the one-argument IndexOf is culture-sensitive, so the
                    // position it reports need not line up with a span of exactly query.Length
                    // characters, which is what the highlight below assumes.
                    int hiliteStart = entry.ChSimpl.IndexOf(query, System.StringComparison.Ordinal);
                    if (hiliteStart == -1)
                    {
                        hiliteStart = entry.ChTrad.IndexOf(query, System.StringComparison.Ordinal);
                    }

                    // Not a keeper if neither headword contains the query. An empty query
                    // "matches" at 0 but has nothing to highlight, so it is dropped as well.
                    if (hiliteStart == -1 || query.Length == 0)
                    {
                        continue;
                    }

                    CedictResult res = new CedictResult(CedictResult.SimpTradWarning.None,
                                                        blobId, entry.HanziPinyinMap,
                                                        hiliteStart, query.Length);
                    resList.Add(new ResWithEntry(res, entry));
                }
                return resList;
            }
Ejemplo n.º 9
0
        /// <summary>
        /// Retrieves pinyin lookup candidates and verifies that the query syllables really
        /// occur, consecutively, in each entry's pinyin.
        /// </summary>
        /// <param name="br">Reader positioned on each candidate in turn to load its entry.</param>
        /// <param name="poss">Positions of the candidate entries.</param>
        /// <param name="sylls">Query syllables; a tone of -1 matches any tone.</param>
        /// <returns>One item per verified entry whose hanzi pass areHanziCovered.</returns>
        List<ResWithEntry> doLoadVerifyPinyin(BinReader br, IEnumerable<int> poss, List<PinyinSyllable> sylls)
        {
            List<ResWithEntry> results = new List<ResWithEntry>();
            // Note carried over from the original: the file is only opened on demand, but
            // within each lookup's scope, so lookup stays thread-safe.
            foreach (int pos in poss)
            {
                // Load up entry from file
                br.Position = pos;
                CedictEntry entry = new CedictEntry(br);

                // Look for the first offset where every query syllable lines up
                int syllStart = -1;
                for (int start = 0; start <= entry.PinyinCount - sylls.Count; ++start)
                {
                    int matched = 0;
                    while (matched < sylls.Count)
                    {
                        PinyinSyllable fromEntry = entry.GetPinyinAt(start + matched);
                        PinyinSyllable fromQuery = sylls[matched];
                        bool textDiffers = fromEntry.Text.ToLowerInvariant() != fromQuery.Text;
                        // Tone only matters when the text agrees and the query names a tone
                        if (textDiffers || (fromQuery.Tone != -1 && fromEntry.Tone != fromQuery.Tone))
                        {
                            break;
                        }
                        ++matched;
                    }
                    if (matched < sylls.Count)
                    {
                        continue;
                    }
                    syllStart = start;
                    break;
                }

                // Drop the entry if the syllables are absent or any Hanzi is unprintable
                if (syllStart == -1 || !areHanziCovered(entry))
                {
                    continue;
                }

                // Keeper!
                results.Add(new ResWithEntry(
                    new CedictResult(pos, entry.HanziPinyinMap, syllStart, sylls.Count),
                    entry));
            }
            return results;
        }
Ejemplo n.º 10
0
        /// <summary>
        /// Retrieves hanzi lookup candidates, verifies actual presence of search expression in headword.
        /// </summary>
        /// <param name="br">Reader positioned on each candidate in turn to load its entry.</param>
        /// <param name="poss">Positions of the candidate entries.</param>
        /// <param name="query">Character sequence to find in the simplified or traditional headword.</param>
        /// <param name="script">Not used by this method body.</param>
        /// <returns>One item per entry that contains the query and passes areHanziCovered.</returns>
        List<ResWithEntry> doLoadVerifyHanzi(BinReader br, IEnumerable<int> poss, string query, SearchScript script)
        {
            List<ResWithEntry> resList = new List<ResWithEntry>();
            // Yes, we only open our file on-demand
            // But we do this within each lookup's scope, so lookup stays thread-safe
            // Look at each entry: load, verify, keep or drop
            foreach (int pos in poss)
            {
                // Load up entry from file
                br.Position = pos;
                CedictEntry entry = new CedictEntry(br);

                // Search simplified first, then traditional. Ordinal comparison is used because
                // the one-argument IndexOf is culture-sensitive, and a linguistic match is not
                // guaranteed to cover exactly query.Length characters at the reported position.
                int hiliteStart = entry.ChSimpl.IndexOf(query, System.StringComparison.Ordinal);
                if (hiliteStart == -1)
                {
                    hiliteStart = entry.ChTrad.IndexOf(query, System.StringComparison.Ordinal);
                }

                // Entry is a keeper only if a headword contains the query; an empty query
                // has nothing to highlight and is never a keeper.
                bool found = hiliteStart != -1 && query.Length != 0;
                if (!found) continue;

                // Drop if there's any unprintable hanzi
                if (!areHanziCovered(entry)) continue;

                // TO-DO: indicate wrong script in result
                CedictResult res = new CedictResult(CedictResult.SimpTradWarning.None,
                    pos, entry.HanziPinyinMap,
                    hiliteStart, query.Length);
                resList.Add(new ResWithEntry(res, entry));
            }
            return resList;
        }
Ejemplo n.º 11
0
 /// <summary>
 /// Compares lookup results after pinyin lookup for sorted presentation.
 /// </summary>
 /// <returns>
 /// The first non-zero outcome of: PinyinCount comparison, pinyin highlight start
 /// comparison, the entry's PinyinCompare.
 /// </returns>
 private static int pyComp(ResWithEntry a, ResWithEntry b)
 {
     // Entry with the smaller pinyin count comes first
     int byLength = a.Entry.PinyinCount.CompareTo(b.Entry.PinyinCount);
     if (byLength == 0)
     {
         // Same count: earlier match start wins; if that ties too, use the entry's pinyin ordering
         int byStart = a.Res.PinyinHiliteStart.CompareTo(b.Res.PinyinHiliteStart);
         return byStart != 0 ? byStart : a.Entry.PinyinCompare(b.Entry);
     }
     return byLength;
 }