/// <summary>
/// Compares two hanzi lookup results for sorted presentation.
/// </summary>
/// <param name="a">First result together with its entry.</param>
/// <param name="b">Second result together with its entry.</param>
/// <returns>
/// A negative value if <paramref name="a"/> sorts first, a positive value if
/// <paramref name="b"/> sorts first, zero if all three sort keys tie.
/// </returns>
private static int hrComp(ResWithEntry a, ResWithEntry b)
{
    // Primary key: the earlier the hanzi match starts, the earlier the result is listed
    int byStart = a.Res.HanziHiliteStart.CompareTo(b.Res.HanziHiliteStart);
    if (byStart == 0)
    {
        // Secondary key: the order defined by the entry's own PinyinCompare
        int byPinyin = a.Entry.PinyinCompare(b.Entry);
        if (byPinyin == 0)
        {
            // Last key: the shorter simplified headword comes first
            return a.Entry.ChSimpl.Length.CompareTo(b.Entry.ChSimpl.Length);
        }
        return byPinyin;
    }
    return byStart;
}
/// <summary>
/// Performs a pinyin lookup and appends the sorted, verified results to <paramref name="res"/>.
/// </summary>
/// <param name="query">Raw query text; it is parsed by interpretPinyin.</param>
/// <param name="ep">Receives each matching entry, registered under the result's entry ID.</param>
/// <param name="res">List the sorted results are appended to; existing items are left in place.</param>
private void lookupPinyin(string query, EntryProvider ep, List<CedictResult> res)
{
    // Interpret query string
    List<PinyinSyllable> qsylls, qnorm;
    interpretPinyin(query, out qsylls, out qnorm);

    // Candidate sets keyed by (normalized) syllable, then narrowed down by intersection
    Dictionary<string, HashSet<int>> candsBySyll = getPinyinCandidates(qnorm);
    List<int> cands = intersectCandidates(candsBySyll);

    // Retrieve all candidates; verify on the fly
    List<ResWithEntry> rl = retrieveVerifyPinyin(cands, qsylls);

    // Sort pinyin results
    rl.Sort(pyComp);

    // Reserve room for the new items. Capacity is only ever raised: assigning a value
    // below the list's current Count throws ArgumentOutOfRangeException, and lowering
    // it would just force a pointless reallocation.
    int needed = res.Count + rl.Count;
    if (res.Capacity < needed)
    {
        res.Capacity = needed;
    }

    // Hand over results and register their entries with the provider
    foreach (ResWithEntry rwe in rl)
    {
        res.Add(rwe.Res);
        ep.AddEntry(rwe.Res.EntryId, rwe.Entry);
    }
}
/// <summary>
/// Loads every candidate entry through the reader and keeps those whose pinyin
/// contains the query syllables as a contiguous run.
/// </summary>
/// <param name="br">Reader that is repositioned to each candidate before the entry is deserialized.</param>
/// <param name="poss">Reader positions of the candidate entries.</param>
/// <param name="sylls">Query syllables; a tone of -1 matches any tone.</param>
/// <returns>Results paired with their entries, in the order the candidates were enumerated.</returns>
List<ResWithEntry> doLoadVerifyPinyin(BinReader br, IEnumerable<int> poss, List<PinyinSyllable> sylls)
{
    List<ResWithEntry> kept = new List<ResWithEntry>();
    // Original author's note: the file is only opened on demand, within each lookup's
    // scope, so that lookup stays thread-safe (the opening happens outside this method).
    foreach (int pos in poss)
    {
        // Deserialize the entry stored at this position
        br.Position = pos;
        CedictEntry entry = new CedictEntry(br);

        // Slide the query over the entry's syllables until the first full match
        int matchStart = -1;
        for (int start = 0; matchStart == -1 && start <= entry.PinyinCount - sylls.Count; ++start)
        {
            bool allMatch = true;
            for (int k = 0; allMatch && k != sylls.Count; ++k)
            {
                PinyinSyllable inEntry = entry.GetPinyinAt(start + k);
                PinyinSyllable inQuery = sylls[k];
                if (inEntry.Text.ToLowerInvariant() != inQuery.Text)
                {
                    // Syllable text differs (entry side is lower-cased first)
                    allMatch = false;
                }
                else if (inQuery.Tone != -1 && inEntry.Tone != inQuery.Tone)
                {
                    // Query names a specific tone and the entry has another one
                    allMatch = false;
                }
            }
            if (allMatch)
            {
                matchStart = start;
            }
        }

        // Keep only entries that match and pass the areHanziCovered check
        if (matchStart != -1 && areHanziCovered(entry))
        {
            CedictResult hit = new CedictResult(pos, entry.HanziPinyinMap, matchStart, sylls.Count);
            kept.Add(new ResWithEntry(hit, entry));
        }
    }
    return kept;
}
/// <summary>
/// Performs a hanzi lookup and appends the sorted, verified results to <paramref name="res"/>.
/// </summary>
/// <param name="query">Raw query; it is upper-cased, trimmed and stripped of spaces before use.</param>
/// <param name="ep">Receives each matching entry, registered under the result's entry ID.</param>
/// <param name="res">List the sorted results are appended to; existing items are left in place.</param>
/// <param name="anns">Not touched by this method.</param>
private void lookupHanzi(string query, EntryProvider ep, List<CedictResult> res, List<CedictAnnotation> anns)
{
    // Normalize the query, then collect its distinct characters
    query = query.ToUpperInvariant();
    query = query.Trim();
    query = query.Replace(" ", "");
    HashSet<char> qhanzi = new HashSet<char>(query);

    // Get instance vectors
    Dictionary<char, HashSet<int>> candsBySimp = new Dictionary<char, HashSet<int>>();
    Dictionary<char, HashSet<int>> candsByTrad = new Dictionary<char, HashSet<int>>();
    if (!getHanziCandidates(qhanzi, candsBySimp, candsByTrad))
    {
        // If at least one Hanzi doesn't occur in any HW: we're done.
        return;
    }

    // Intersect candidates per script, then take the union of both scripts
    HashSet<int> candsSimp = intersectCandidates(candsBySimp);
    HashSet<int> candsTrad = intersectCandidates(candsByTrad);
    HashSet<int> cands = new HashSet<int>();
    cands.UnionWith(candsSimp);
    cands.UnionWith(candsTrad);

    // Retrieve all candidates; verify on the fly
    List<ResWithEntry> rl = retrieveVerifyHanzi(cands, query);

    // Sort Hanzi results
    rl.Sort(hrComp);

    // Reserve room for the new items. Capacity is only ever raised: assigning a value
    // below the list's current Count throws ArgumentOutOfRangeException, and lowering
    // it would just force a pointless reallocation.
    int needed = res.Count + rl.Count;
    if (res.Capacity < needed)
    {
        res.Capacity = needed;
    }

    // Hand over results and register their entries with the provider
    foreach (ResWithEntry rwe in rl)
    {
        res.Add(rwe.Res);
        ep.AddEntry(rwe.Res.EntryId, rwe.Entry);
    }
}
/// <summary>
/// Retrieves hanzi lookup candidates, verifies actual presence of search expression in headword.
/// </summary>
/// <param name="br">Reader that is repositioned to each candidate before the entry is deserialized.</param>
/// <param name="poss">Reader positions of the candidate entries.</param>
/// <param name="query">Text to find in the simplified or, failing that, the traditional headword.</param>
/// <param name="script">Not used by this method yet (see the TO-DO on the result's warning flag).</param>
/// <returns>Results paired with their entries, in the order the candidates were enumerated.</returns>
List<ResWithEntry> doLoadVerifyHanzi(BinReader br, IEnumerable<int> poss, string query, SearchScript script)
{
    List<ResWithEntry> resList = new List<ResWithEntry>();
    // Original author's note: the file is only opened on demand, within each lookup's
    // scope, so that lookup stays thread-safe (the opening happens outside this method).
    // Look at each entry: load, verify, keep or drop
    foreach (int pos in poss)
    {
        // Load up entry from file
        br.Position = pos;
        CedictEntry entry = new CedictEntry(br);

        // Locate the query in the simplified headword first, then in the traditional one.
        // The search is ordinal: headwords are matched code unit by code unit, independent
        // of the current culture's linguistic comparison rules.
        int hiliteStart = entry.ChSimpl.IndexOf(query, System.StringComparison.Ordinal);
        if (hiliteStart == -1)
        {
            hiliteStart = entry.ChTrad.IndexOf(query, System.StringComparison.Ordinal);
        }

        // Not a keeper if neither headword contains the query; an empty query has
        // nothing to highlight and never yields a result either
        if (hiliteStart == -1 || query.Length == 0)
        {
            continue;
        }
        // Drop if there's any unprintable hanzi
        if (!areHanziCovered(entry))
        {
            continue;
        }

        // TO-DO: indicate wrong script in result
        CedictResult res = new CedictResult(CedictResult.SimpTradWarning.None,
            pos, entry.HanziPinyinMap, hiliteStart, query.Length);
        resList.Add(new ResWithEntry(res, entry));
    }
    return resList;
}
/// <summary>
/// Loads each candidate entry via loadFromBlob and keeps those whose pinyin
/// contains the query syllables as a contiguous run.
/// </summary>
/// <param name="cands">IDs of the candidate entries.</param>
/// <param name="qsylls">Query syllables; a tone of -1 matches any tone.</param>
/// <returns>Results paired with their entries, in candidate order.</returns>
private List<ResWithEntry> retrieveVerifyPinyin(List<int> cands, List<PinyinSyllable> qsylls)
{
    List<ResWithEntry> verified = new List<ResWithEntry>();
    foreach (int blobId in cands)
    {
        // Load entry from DB
        CedictEntry entry = loadFromBlob(blobId);

        // Try every offset at which the whole query still fits into the entry
        int foundAt = -1;
        int offset = 0;
        while (foundAt == -1 && offset <= entry.PinyinCount - qsylls.Count)
        {
            int matched = 0;
            while (matched != qsylls.Count)
            {
                PinyinSyllable have = entry.GetPinyinAt(offset + matched);
                PinyinSyllable want = qsylls[matched];
                // Text is compared against the lower-cased entry syllable;
                // tone only counts when the query specifies one
                bool mismatch = have.Text.ToLowerInvariant() != want.Text
                    || (want.Tone != -1 && have.Tone != want.Tone);
                if (mismatch)
                {
                    break;
                }
                ++matched;
            }
            if (matched == qsylls.Count)
            {
                foundAt = offset;
            }
            else
            {
                ++offset;
            }
        }

        // Entry is a keeper only if the query syllables were found
        if (foundAt != -1)
        {
            CedictResult cres = new CedictResult(blobId, entry.HanziPinyinMap, foundAt, qsylls.Count);
            verified.Add(new ResWithEntry(cres, entry));
        }
    }
    return verified;
}
/// <summary>
/// Compares two pinyin lookup results for sorted presentation.
/// </summary>
/// <param name="a">First result together with its entry.</param>
/// <param name="b">Second result together with its entry.</param>
/// <returns>
/// A negative value if <paramref name="a"/> sorts first, a positive value if
/// <paramref name="b"/> sorts first, zero if all three sort keys tie.
/// </returns>
private static int pyComp(ResWithEntry a, ResWithEntry b)
{
    // Entries with fewer pinyin syllables come first
    int order = a.Entry.PinyinCount.CompareTo(b.Entry.PinyinCount);
    if (order == 0)
    {
        // Same syllable count: the earlier match start wins
        order = a.Res.PinyinHiliteStart.CompareTo(b.Res.PinyinHiliteStart);
    }
    if (order == 0)
    {
        // Still tied: fall back to the entry's own pinyin comparison
        order = a.Entry.PinyinCompare(b.Entry);
    }
    return order;
}
/// <summary>
/// Loads each candidate entry via loadFromBlob and keeps those whose simplified or
/// traditional headword actually contains the query.
/// </summary>
/// <param name="cands">IDs of the candidate entries.</param>
/// <param name="query">Text to find in the headword; an empty query yields no results.</param>
/// <returns>Results paired with their entries, in the enumeration order of <paramref name="cands"/>.</returns>
private List<ResWithEntry> retrieveVerifyHanzi(HashSet<int> cands, string query)
{
    List<ResWithEntry> resList = new List<ResWithEntry>();
    foreach (int blobId in cands)
    {
        // Load entry from DB
        CedictEntry entry = loadFromBlob(blobId);

        // Locate the query in the simplified headword first, then in the traditional one.
        // The search is ordinal: headwords are matched code unit by code unit, independent
        // of the current culture's linguistic comparison rules.
        int hiliteStart = entry.ChSimpl.IndexOf(query, System.StringComparison.Ordinal);
        if (hiliteStart == -1)
        {
            hiliteStart = entry.ChTrad.IndexOf(query, System.StringComparison.Ordinal);
        }

        // Not a keeper if neither headword contains the query; an empty query has
        // nothing to highlight and never yields a result either
        if (hiliteStart == -1 || query.Length == 0)
        {
            continue;
        }

        CedictResult res = new CedictResult(CedictResult.SimpTradWarning.None,
            blobId, entry.HanziPinyinMap, hiliteStart, query.Length);
        resList.Add(new ResWithEntry(res, entry));
    }
    return resList;
}
/// <summary>
/// Reads the candidate entries through the reader and returns those in which the
/// query syllables occur consecutively.
/// </summary>
/// <param name="br">Reader that is repositioned to each candidate before the entry is deserialized.</param>
/// <param name="poss">Reader positions of the candidate entries.</param>
/// <param name="sylls">Query syllables; a tone of -1 matches any tone.</param>
/// <returns>Results paired with their entries, in the order the candidates were enumerated.</returns>
List<ResWithEntry> doLoadVerifyPinyin(BinReader br, IEnumerable<int> poss, List<PinyinSyllable> sylls)
{
    List<ResWithEntry> hits = new List<ResWithEntry>();
    // Original author's note: the file is only opened on demand, within each lookup's
    // scope, so that lookup stays thread-safe (the opening happens outside this method).
    foreach (int pos in poss)
    {
        // Position the reader and deserialize the entry
        br.Position = pos;
        CedictEntry entry = new CedictEntry(br);

        // Search for the first offset where every query syllable lines up
        int hitAt = -1;
        int offset = 0;
        while (hitAt == -1 && offset <= entry.PinyinCount - sylls.Count)
        {
            int matched = 0;
            while (matched != sylls.Count)
            {
                PinyinSyllable have = entry.GetPinyinAt(offset + matched);
                PinyinSyllable want = sylls[matched];
                // Entry text is lower-cased before comparing; the tone is only
                // checked when the query syllable carries one
                if (have.Text.ToLowerInvariant() != want.Text) break;
                if (want.Tone != -1 && have.Tone != want.Tone) break;
                ++matched;
            }
            if (matched == sylls.Count) hitAt = offset;
            else ++offset;
        }

        // No match: drop the entry
        if (hitAt == -1) continue;
        // Drop if there's any unprintable Hanzi
        if (!areHanziCovered(entry)) continue;

        // Keeper!
        CedictResult found = new CedictResult(pos, entry.HanziPinyinMap, hitAt, sylls.Count);
        hits.Add(new ResWithEntry(found, entry));
    }
    return hits;
}
/// <summary>
/// Retrieves hanzi lookup candidates, verifies actual presence of search expression in headword.
/// </summary>
/// <param name="br">Reader that is repositioned to each candidate before the entry is deserialized.</param>
/// <param name="poss">Reader positions of the candidate entries.</param>
/// <param name="query">Text to find in the simplified or, failing that, the traditional headword.</param>
/// <param name="script">Not used by this method yet (see the TO-DO on the result's warning flag).</param>
/// <returns>Results paired with their entries, in the order the candidates were enumerated.</returns>
List<ResWithEntry> doLoadVerifyHanzi(BinReader br, IEnumerable<int> poss, string query, SearchScript script)
{
    List<ResWithEntry> resList = new List<ResWithEntry>();
    // Original author's note: the file is only opened on demand, within each lookup's
    // scope, so that lookup stays thread-safe (the opening happens outside this method).
    // Look at each entry: load, verify, keep or drop
    foreach (int pos in poss)
    {
        // Load up entry from file
        br.Position = pos;
        CedictEntry entry = new CedictEntry(br);

        // Figure out where the query sits in the simplified headword; if it is not there,
        // look in the traditional one. Ordinal comparison keeps the match exact and
        // independent of the current culture's linguistic comparison rules.
        int hiliteStart = entry.ChSimpl.IndexOf(query, System.StringComparison.Ordinal);
        if (hiliteStart == -1)
            hiliteStart = entry.ChTrad.IndexOf(query, System.StringComparison.Ordinal);

        // A found, non-empty query gives a highlight as long as the query itself
        int hiliteLength = hiliteStart == -1 ? 0 : query.Length;

        // Entry is a keeper if either headword contains the query
        if (hiliteLength == 0) continue;
        // Drop if there's any unprintable hanzi
        if (!areHanziCovered(entry)) continue;

        // TO-DO: indicate wrong script in result
        CedictResult res = new CedictResult(CedictResult.SimpTradWarning.None,
            pos, entry.HanziPinyinMap, hiliteStart, hiliteLength);
        ResWithEntry resWE = new ResWithEntry(res, entry);
        resList.Add(resWE);
    }
    return resList;
}
/// <summary>
/// Orders lookup results after a pinyin lookup for sorted presentation.
/// </summary>
/// <param name="a">First result together with its entry.</param>
/// <param name="b">Second result together with its entry.</param>
/// <returns>
/// A negative value if <paramref name="a"/> sorts first, a positive value if
/// <paramref name="b"/> sorts first, zero if all three sort keys tie.
/// </returns>
private static int pyComp(ResWithEntry a, ResWithEntry b)
{
    // Key 1: fewer pinyin syllables first
    int bySyllableCount = a.Entry.PinyinCount.CompareTo(b.Entry.PinyinCount);
    if (bySyllableCount != 0)
    {
        return bySyllableCount;
    }

    // Key 2: earlier match start first; key 3: the entry's own pinyin comparison
    int byMatchStart = a.Res.PinyinHiliteStart.CompareTo(b.Res.PinyinHiliteStart);
    return byMatchStart != 0 ? byMatchStart : a.Entry.PinyinCompare(b.Entry);
}