/// <summary>
/// Ctor: initializes an immutable instance from its three hybrid-text parts.
/// </summary>
/// <param name="domain">Value stored as the sense's domain.</param>
/// <param name="equiv">Value stored as the sense's equivalent.</param>
/// <param name="note">Value stored as the sense's note.</param>
public CedictSense(HybridText domain, HybridText equiv, HybridText note)
{
    this.Domain = domain;
    this.Equiv = equiv;
    this.Note = note;
}
/// <summary>
/// <para>Tokenizes a sense's equiv, presented as hybrid text.</para>
/// <para>Chinese runs yield a single placeholder token; every other run is handed to tokenizeRun
/// (which, per the original note, is where new word IDs are created as tokens come up).</para>
/// </summary>
/// <param name="txt">Hybrid text whose runs are tokenized in order.</param>
/// <returns>Read-only view of the tokens collected across all runs.</returns>
public ReadOnlyCollection<EquivToken> Tokenize(HybridText txt)
{
    List<EquivToken> tokens = new List<EquivToken>();
    int runIndex = 0;
    foreach (TextRun run in txt.Runs)
    {
        if (run is TextRunZho)
        {
            // A Chinese run is represented by one zero-length token carrying the Zho ID
            tokens.Add(new EquivToken
            {
                TokenId = wh.IdZho,
                RunIx = runIndex,
                StartInRun = 0,
                LengthInRun = 0,
            });
        }
        else
        {
            tokenizeRun(run.GetPlainText(), runIndex, tokens);
        }
        ++runIndex;
    }
    return new ReadOnlyCollection<EquivToken>(tokens);
}
/// <summary>
/// Converts a hybrid text to CEDICT-formatted plain text (marking up hanzi+pinyin sections).
/// </summary>
/// <param name="ht">Hybrid text to serialize, run by run.</param>
/// <returns>
/// Latin runs verbatim; Chinese runs as simplified, then "|" plus traditional when it differs,
/// then pinyin in square brackets. Runs are separated by a single space, except that no space
/// is put before a Latin run that is empty or starts with punctuation.
/// </returns>
public static string HybridToCedict(HybridText ht)
{
    StringBuilder result = new StringBuilder();
    for (int i = 0; i < ht.RunCount; ++i)
    {
        bool notFirst = i > 0;
        TextRun run = ht.GetRunAt(i);

        if (!(run is TextRunLatin))
        {
            // Chinese run: always separated from whatever came before
            if (notFirst)
            {
                result.Append(' ');
            }
            TextRunZho zho = run as TextRunZho;
            if (!string.IsNullOrEmpty(zho.Simp))
            {
                result.Append(zho.Simp);
            }
            if (zho.Trad != zho.Simp && !string.IsNullOrEmpty(zho.Trad))
            {
                result.Append('|').Append(zho.Trad);
            }
            if (zho.Pinyin != null)
            {
                result.Append('[').Append(GetPinyinCedict(zho.Pinyin)).Append(']');
            }
            continue;
        }

        // Latin run: punctuation sticks to the preceding run
        string text = run.GetPlainText();
        if (notFirst && text != string.Empty && !char.IsPunctuation(text[0]))
        {
            result.Append(' ');
        }
        result.Append(text);
    }
    return result.ToString();
}
/// <summary>
/// Returns true if display font covers all Hanzi in hybrid text; false otherwise.
/// </summary>
/// <param name="ht">Hybrid text whose Chinese runs are checked; an empty text counts as covered.</param>
/// <returns>
/// False as soon as one simplified character lacks <see cref="FontCoverageFlags.Simp"/> coverage
/// or one traditional character lacks <see cref="FontCoverageFlags.Trad"/> coverage; true otherwise.
/// </returns>
private bool areHanziCovered(HybridText ht)
{
    if (ht.IsEmpty)
    {
        return true;
    }
    for (int i = 0; i != ht.RunCount; ++i)
    {
        // Only Chinese runs carry Hanzi
        TextRunZho zho = ht.GetRunAt(i) as TextRunZho;
        if (zho == null)
        {
            continue;
        }
        // Simplified and traditional are checked independently: a run that has no
        // simplified text must still have its traditional text verified.
        if (zho.Simp != null)
        {
            foreach (char c in zho.Simp)
            {
                if (!cvr.GetCoverage(c).HasFlag(FontCoverageFlags.Simp))
                {
                    return false;
                }
            }
        }
        if (zho.Trad != null)
        {
            foreach (char c in zho.Trad)
            {
                if (!cvr.GetCoverage(c).HasFlag(FontCoverageFlags.Trad))
                {
                    return false;
                }
            }
        }
    }
    return true;
}
/// <summary>
/// Converts a hybrid text to HTML (marking up hanzi+pinyin sections).
/// </summary>
/// <param name="ht">Hybrid text to render, run by run.</param>
/// <param name="script">
/// Selects the Hanzi shown for Chinese runs: traditional only, simplified only, or
/// (for <see cref="SearchScript.Both"/>) simplified followed by a bullet and traditional.
/// </param>
/// <returns>HTML fragment; runs are separated by spaces, except before Latin runs that start with punctuation.</returns>
public static string HybridToHtml(HybridText ht, SearchScript script)
{
    StringBuilder sb = new StringBuilder();
    bool first = true;
    for (int i = 0; i != ht.RunCount; ++i)
    {
        TextRun tr = ht.GetRunAt(i);
        if (tr is TextRunLatin)
        {
            string strRun = tr.GetPlainText();
            // The punctuation test looks at the raw text, before escaping
            if (!first && strRun != string.Empty && !char.IsPunctuation(strRun[0]))
            {
                sb.Append(' ');
            }
            // Latin text goes through the same escape helper as hanzi and pinyin below,
            // so markup characters in dictionary text cannot break the generated HTML.
            if (!string.IsNullOrEmpty(strRun))
            {
                sb.Append(escape(strRun));
            }
        }
        else
        {
            if (!first)
            {
                sb.Append(' ');
            }
            TextRunZho trz = tr as TextRunZho;

            // Primary Hanzi: traditional if that is the requested script, simplified otherwise
            string hanzi1 = (script == SearchScript.Traditional) ? trz.Trad : trz.Simp;
            if (string.IsNullOrEmpty(hanzi1))
            {
                hanzi1 = null;
            }
            // Secondary Hanzi: traditional, only in "both" mode and only next to a primary
            string hanzi2 = null;
            if (hanzi1 != null && script == SearchScript.Both && !string.IsNullOrEmpty(trz.Trad))
            {
                hanzi2 = trz.Trad;
            }
            if (hanzi1 != null)
            {
                hanzi1 = escape(hanzi1);
            }
            if (hanzi2 != null)
            {
                hanzi2 = escape(hanzi2);
            }

            // Hanzi section, wrapped in the sense-hanzi template
            if (hanzi1 != null || hanzi2 != null)
            {
                sb.Append(templateSenseHanziOpen);
            }
            if (hanzi1 != null)
            {
                sb.Append(hanzi1);
            }
            if (hanzi2 != null)
            {
                sb.Append(' ');
                sb.Append(templateBullet);
                sb.Append(' ');
                sb.Append(hanzi2);
            }
            if (hanzi1 != null || hanzi2 != null)
            {
                sb.Append(templateSenseHanziClose);
            }

            // Pinyin in brackets, separated from Hanzi by a space
            if (trz.Pinyin != null)
            {
                if (hanzi1 != null)
                {
                    sb.Append(' ');
                }
                sb.Append('[');
                sb.Append(escape(trz.GetPinyinInOne(true)));
                sb.Append(']');
            }
        }
        first = false;
    }
    return sb.ToString();
}
/// <summary>
/// Breaks down body content into typographic blocks and caches the size of these.
/// </summary>
/// <param name="g">A Graphics object used for measurements. Its TextRenderingHint is set to AntiAlias.</param>
private void doMeasureBlocks(Graphics g)
{
    // Measurement happens only once; afterwards the cached blocks are reused
    if (measuredBlocks != null)
    {
        return;
    }

    // Measurement setup
    StringFormat sf = StringFormat.GenericTypographic;
    g.TextRenderingHint = System.Drawing.Text.TextRenderingHint.AntiAlias;

    // Sense ID box is a square whose side is the height of a Latin letter
    SizeF letterSize = g.MeasureString("x", getFont(fntSenseLatin), 65535, sf);
    ushort senseIdxWidth = (ushort)Math.Ceiling(letterSize.Height);

    // One slot per sense: the highlight in that sense's equiv, or null
    CedictTargetHighlight[] hiliteBySense = new CedictTargetHighlight[entry.SenseCount];
    foreach (CedictTargetHighlight hilite in res.TargetHilites)
    {
        hiliteBySense[hilite.SenseIx] = hilite;
    }

    List<Block> newBlocks = new List<Block>();
    // Links are gathered here and only kept at the end if there are any
    List<LinkArea> newLinks = new List<LinkArea>();

    int senseIdx = -1;
    int displaySenseIdx = -1;
    bool lastWasClassifier = false;
    foreach (CedictSense sense in entry.Senses)
    {
        ++senseIdx;
        // A "CL:" domain marks a classifier pseudo-sense, which gets no sense ID of its own
        bool classifier = sense.Domain.EqualsPlainText("CL:");

        int startIX;
        if (classifier)
        {
            // Domain is replaced by the localized "Classifier:" label, forced onto a new line
            startIX = newBlocks.Count;
            HybridText label = new HybridText(tprov.GetString("ResultCtrlClassifier"));
            makeBlocks(label, true, null, newBlocks, newLinks);
            Block labelHead = newBlocks[startIX];
            labelHead.NewLine = true;
            newBlocks[startIX] = labelHead;
        }
        else
        {
            // Regular sense: numbered ID block first, then the domain
            ++displaySenseIdx;
            newBlocks.Add(new Block
            {
                Width = senseIdxWidth,
                StickRight = true,
                TextPos = textPool.PoolString(getSenseIdString(displaySenseIdx)),
                NewLine = lastWasClassifier,
                SenseId = true,
                FirstInCedictSense = true,
            });
            startIX = newBlocks.Count;
            makeBlocks(sense.Domain, true, null, newBlocks, newLinks);
        }

        // Equiv (with highlight, if any) and note follow in both cases
        makeBlocks(sense.Equiv, false, hiliteBySense[senseIdx], newBlocks, newLinks);
        makeBlocks(sense.Note, true, null, newBlocks, newLinks);

        // A classifier has no ID block, so its first content block marks the sense start
        if (classifier)
        {
            Block senseStart = newBlocks[startIX];
            senseStart.FirstInCedictSense = true;
            newBlocks[startIX] = senseStart;
        }

        // Measure every block created for this sense (the ID block already has its width)
        for (int i = startIX; i != newBlocks.Count; ++i)
        {
            Block current = newBlocks[i];
            string text = textPool.GetString(current.TextPos);
            bool isLatin = current.FontIdx == fntMetaLatin || current.FontIdx == fntSenseLatin;
            SizeF size = isLatin
                ? g.MeasureString(text, getFont(current.FontIdx), 65535, sf)
                : HanziRenderer.MeasureString(g, Magic.ZhoContentFontFamily, text, Magic.LemmaHanziFontSize);
            current.Width = (ushort)Math.Round(size.Width);
            newBlocks[i] = current;
        }

        lastWasClassifier = classifier;
    }

    if (newLinks.Count != 0)
    {
        targetLinks = newLinks;
    }
    measuredBlocks = newBlocks.ToArray();
}
/// <summary>
/// <para>Produces unmeasured display blocks from a single hybrid text. Marks highlights, if any.</para>
/// <para>Does not fill in blocks' size, but fills in everything else.</para>
/// </summary>
/// <param name="htxt">Hybrid text to break down into blocks.</param>
/// <param name="isMeta">True if this is a domain or note (displayed in italics).</param>
/// <param name="hl">Highlight to show in hybrid text, or null.</param>
/// <param name="blocks">List of blocks to append to.</param>
/// <param name="links">List to gather links (appending to list).</param>
private void makeBlocks(HybridText htxt, bool isMeta, CedictTargetHighlight hl, List<Block> blocks, List<LinkArea> links)
{
    byte fntIdxLatin = isMeta ? fntMetaLatin : fntSenseLatin;
    byte fntIdxZhoSimp = isMeta ? fntMetaHanziSimp : fntSenseHanziSimp;
    byte fntIdxZhoTrad = isMeta ? fntMetaHanziTrad : fntSenseHanziTrad;

    // Go run by run
    for (int runIX = 0; runIX != htxt.RunCount; ++runIX)
    {
        TextRun run = htxt.GetRunAt(runIX);
        // Latin run: split by spaces first
        if (run is TextRunLatin)
        {
            string[] bySpaces = run.GetPlainText().Split(new char[] { ' ' });
            // Position of the current word within the run's text - for highlights
            int latnPos = 0;
            foreach (string str in bySpaces)
            {
                // Each word: also by dash
                string[] byDashes = splitByDash(str);
                // Offset of the current part within the word - for highlights
                int ofsPos = 0;
                foreach (string blockStr in byDashes)
                {
                    Block tb = new Block
                    {
                        TextPos = textPool.PoolString(blockStr),
                        FontIdx = fntIdxLatin,
                        SpaceAfter = false, // will set this true for last block in "byDashes"
                    };
                    // Does block's text intersect with highlight?
                    if (hl != null && hl.RunIx == runIX)
                    {
                        int blockStart = latnPos + ofsPos;
                        int blockEnd = blockStart + blockStr.Length;
                        int hlStart = hl.HiliteStart;
                        int hlEnd = hl.HiliteStart + hl.HiliteLength;
                        bool startsInside = blockStart >= hlStart && blockStart < hlEnd;
                        bool endsInside = blockEnd > hlStart && blockEnd <= hlEnd;
                        bool encloses = blockStart < hlStart && blockEnd >= hlEnd;
                        if (startsInside || endsInside || encloses)
                        {
                            tb.Hilite = true;
                        }
                    }
                    blocks.Add(tb);
                    // Keep track of position for highlight
                    ofsPos += blockStr.Length;
                }
                // Make sure last one is followed by space.
                // Only if this word actually produced a block: otherwise the "last block"
                // would belong to an earlier word or run, or not exist at all.
                if (byDashes.Length != 0)
                {
                    Block xb = blocks[blocks.Count - 1];
                    xb.SpaceAfter = true;
                    blocks[blocks.Count - 1] = xb;
                }
                // Keep track of position in text - for highlights (+1 for the space split on)
                latnPos += str.Length + 1;
            }
        }
        // Chinese: depends on T/S/Both display mode, and on available info
        else
        {
            TextRunZho zhoRun = run as TextRunZho;
            // Chinese range is made up of:
            // Simplified (empty string if only traditional requested)
            // Separator (if both simplified and traditional are requested)
            // Traditional (empty string if only simplified requested)
            // Pinyin with accents as tone marks, in brackets (if present)
            string strSimp = string.Empty;
            if (analyzedScript != SearchScript.Traditional && zhoRun.Simp != null)
            {
                strSimp = zhoRun.Simp;
            }
            string strTrad = string.Empty;
            if (analyzedScript != SearchScript.Simplified && zhoRun.Trad != null)
            {
                strTrad = zhoRun.Trad;
            }
            // Pinyin in display format and in raw form for the link's query string
            string strPy = string.Empty;
            string strPyNumbers = string.Empty;
            if (zhoRun.Pinyin != null)
            {
                strPy = "[" + zhoRun.GetPinyinInOne(true) + "]";
                strPyNumbers = zhoRun.GetPinyinRaw();
            }
            // Create link area, with query string
            LinkArea linkArea = new LinkArea(strSimp, strTrad, strPyNumbers, analyzedScript);

            // Block for simplified, if present
            if (strSimp != string.Empty)
            {
                Block tb = new Block
                {
                    TextPos = textPool.PoolString(strSimp),
                    FontIdx = fntIdxZhoSimp,
                    SpaceAfter = true,
                };
                blocks.Add(tb);
                linkArea.BlockIds.Add(blocks.Count - 1);
            }
            // Separator if both simplified and traditional are there
            // AND they are different...
            if (strSimp != string.Empty && strTrad != string.Empty && strSimp != strTrad)
            {
                // Simplified block (just added) must stay on the same line as the separator
                Block xb = blocks[blocks.Count - 1];
                xb.StickRight = true;
                blocks[blocks.Count - 1] = xb;
                Block tb = new Block
                {
                    TextPos = textPool.PoolString("•"),
                    FontIdx = fntIdxLatin,
                    SpaceAfter = true,
                };
                blocks.Add(tb);
                linkArea.BlockIds.Add(blocks.Count - 1);
            }
            // Traditional, if present and not a repeat of simplified
            if (strTrad != string.Empty && strTrad != strSimp)
            {
                Block tb = new Block
                {
                    TextPos = textPool.PoolString(strTrad),
                    FontIdx = fntIdxZhoTrad,
                    SpaceAfter = true,
                };
                blocks.Add(tb);
                linkArea.BlockIds.Add(blocks.Count - 1);
            }
            // Pinyin, if present: one block per space-separated part
            if (strPy != string.Empty)
            {
                string[] pyParts = strPy.Split(new char[] { ' ' });
                foreach (string pyPart in pyParts)
                {
                    Block tb = new Block
                    {
                        TextPos = textPool.PoolString(pyPart),
                        FontIdx = fntIdxLatin,
                        SpaceAfter = true,
                    };
                    blocks.Add(tb);
                    linkArea.BlockIds.Add(blocks.Count - 1);
                }
            }
            // Last part will have requested a space after.
            // Look ahead and if next text run is Latin and starts with punctuation, make it stick.
            // An empty Latin run has no first character, so it never triggers this.
            TextRunLatin nextLatinRun = null;
            if (runIX + 1 < htxt.RunCount)
            {
                nextLatinRun = htxt.GetRunAt(runIX + 1) as TextRunLatin;
            }
            if (nextLatinRun != null)
            {
                string nextText = nextLatinRun.GetPlainText();
                if (!string.IsNullOrEmpty(nextText) && char.IsPunctuation(nextText[0]))
                {
                    Block xb = blocks[blocks.Count - 1];
                    xb.SpaceAfter = false;
                    blocks[blocks.Count - 1] = xb;
                }
            }
            // Collect link area
            links.Add(linkArea);
        }
    }
}
/// <summary>
/// <para>Produces unmeasured display blocks from a single hybrid text. Marks highlights, if any.</para>
/// <para>Does not fill in blocks' size, but fills in everything else.</para>
/// </summary>
/// <param name="htxt">Hybrid text to break down into blocks.</param>
/// <param name="isMeta">True if this is a domain or note (displayed in italics).</param>
/// <param name="hl">Highlight to show in hybrid text, or null.</param>
/// <param name="blocks">List of blocks to append to.</param>
/// <param name="links">List to gather links (appending to list).</param>
private void makeBlocks(HybridText htxt, bool isMeta, CedictTargetHighlight hl, List<Block> blocks, List<LinkArea> links)
{
    byte fntIdxLatin = isMeta ? fntMetaLatin : fntSenseLatin;
    byte fntIdxZhoSimp = isMeta ? fntMetaHanziSimp : fntSenseHanziSimp;
    byte fntIdxZhoTrad = isMeta ? fntMetaHanziTrad : fntSenseHanziTrad;

    // Go run by run
    for (int runIX = 0; runIX != htxt.RunCount; ++runIX)
    {
        TextRun run = htxt.GetRunAt(runIX);
        // Latin run: split by spaces first
        if (run is TextRunLatin)
        {
            string[] bySpaces = run.GetPlainText().Split(new char[] { ' ' });
            // Position of the current word within the run's text - for highlights
            int latnPos = 0;
            foreach (string str in bySpaces)
            {
                // Each word: also by dash
                string[] byDashes = splitByDash(str);
                // Offset of the current part within the word - for highlights
                int ofsPos = 0;
                foreach (string blockStr in byDashes)
                {
                    Block tb = new Block
                    {
                        TextPos = textPool.PoolString(blockStr),
                        FontIdx = fntIdxLatin,
                        SpaceAfter = false, // will set this true for last block in "byDashes"
                    };
                    // Does block's text intersect with highlight?
                    if (hl != null && hl.RunIx == runIX)
                    {
                        int blockStart = latnPos + ofsPos;
                        int blockEnd = blockStart + blockStr.Length;
                        int hlStart = hl.HiliteStart;
                        int hlEnd = hl.HiliteStart + hl.HiliteLength;
                        bool startsInside = blockStart >= hlStart && blockStart < hlEnd;
                        bool endsInside = blockEnd > hlStart && blockEnd <= hlEnd;
                        bool encloses = blockStart < hlStart && blockEnd >= hlEnd;
                        if (startsInside || endsInside || encloses)
                        {
                            tb.Hilite = true;
                        }
                    }
                    blocks.Add(tb);
                    // Keep track of position for highlight
                    ofsPos += blockStr.Length;
                }
                // Make sure last one is followed by space.
                // Only if this word actually produced a block: otherwise the "last block"
                // would belong to an earlier word or run, or not exist at all.
                if (byDashes.Length != 0)
                {
                    Block xb = blocks[blocks.Count - 1];
                    xb.SpaceAfter = true;
                    blocks[blocks.Count - 1] = xb;
                }
                // Keep track of position in text - for highlights (+1 for the space split on)
                latnPos += str.Length + 1;
            }
        }
        // Chinese: depends on T/S/Both display mode, and on available info
        else
        {
            TextRunZho zhoRun = run as TextRunZho;
            // Chinese range is made up of:
            // Simplified (empty string if only traditional requested)
            // Separator (if both simplified and traditional are requested)
            // Traditional (empty string if only simplified requested)
            // Pinyin with accents as tone marks, in brackets (if present)
            string strSimp = string.Empty;
            if (analyzedScript != SearchScript.Traditional && zhoRun.Simp != null)
            {
                strSimp = zhoRun.Simp;
            }
            string strTrad = string.Empty;
            if (analyzedScript != SearchScript.Simplified && zhoRun.Trad != null)
            {
                strTrad = zhoRun.Trad;
            }
            // Pinyin in display format and in raw form for the link's query string
            string strPy = string.Empty;
            string strPyNumbers = string.Empty;
            if (zhoRun.Pinyin != null)
            {
                strPy = "[" + zhoRun.GetPinyinInOne(true) + "]";
                strPyNumbers = zhoRun.GetPinyinRaw();
            }
            // Create link area, with query string
            LinkArea linkArea = new LinkArea(strSimp, strTrad, strPyNumbers, analyzedScript);

            // Block for simplified, if present
            if (strSimp != string.Empty)
            {
                Block tb = new Block
                {
                    TextPos = textPool.PoolString(strSimp),
                    FontIdx = fntIdxZhoSimp,
                    SpaceAfter = true,
                };
                blocks.Add(tb);
                linkArea.BlockIds.Add(blocks.Count - 1);
            }
            // Separator if both simplified and traditional are there
            // AND they are different...
            if (strSimp != string.Empty && strTrad != string.Empty && strSimp != strTrad)
            {
                // Simplified block (just added) must stay on the same line as the separator
                Block xb = blocks[blocks.Count - 1];
                xb.StickRight = true;
                blocks[blocks.Count - 1] = xb;
                Block tb = new Block
                {
                    TextPos = textPool.PoolString("•"),
                    FontIdx = fntIdxLatin,
                    SpaceAfter = true,
                };
                blocks.Add(tb);
                linkArea.BlockIds.Add(blocks.Count - 1);
            }
            // Traditional, if present and not a repeat of simplified
            if (strTrad != string.Empty && strTrad != strSimp)
            {
                Block tb = new Block
                {
                    TextPos = textPool.PoolString(strTrad),
                    FontIdx = fntIdxZhoTrad,
                    SpaceAfter = true,
                };
                blocks.Add(tb);
                linkArea.BlockIds.Add(blocks.Count - 1);
            }
            // Pinyin, if present: one block per space-separated part
            if (strPy != string.Empty)
            {
                string[] pyParts = strPy.Split(new char[] { ' ' });
                foreach (string pyPart in pyParts)
                {
                    Block tb = new Block
                    {
                        TextPos = textPool.PoolString(pyPart),
                        FontIdx = fntIdxLatin,
                        SpaceAfter = true,
                    };
                    blocks.Add(tb);
                    linkArea.BlockIds.Add(blocks.Count - 1);
                }
            }
            // Last part will have requested a space after.
            // Look ahead and if next text run is Latin and starts with punctuation, make it stick.
            // An empty Latin run has no first character, so it never triggers this.
            TextRunLatin nextLatinRun = null;
            if (runIX + 1 < htxt.RunCount)
            {
                nextLatinRun = htxt.GetRunAt(runIX + 1) as TextRunLatin;
            }
            if (nextLatinRun != null)
            {
                string nextText = nextLatinRun.GetPlainText();
                if (!string.IsNullOrEmpty(nextText) && char.IsPunctuation(nextText[0]))
                {
                    Block xb = blocks[blocks.Count - 1];
                    xb.SpaceAfter = false;
                    blocks[blocks.Count - 1] = xb;
                }
            }
            // Collect link area
            links.Add(linkArea);
        }
    }
}
/// <summary>
/// Converts a hybrid text to HTML (marking up hanzi+pinyin sections).
/// </summary>
/// <param name="ht">Hybrid text to render, run by run.</param>
/// <param name="script">
/// Selects the Hanzi shown for Chinese runs: traditional only, simplified only, or
/// (for <see cref="SearchScript.Both"/>) simplified followed by a bullet and traditional.
/// </param>
/// <returns>HTML fragment; runs are separated by spaces, except before Latin runs that start with punctuation.</returns>
public static string HybridToHtml(HybridText ht, SearchScript script)
{
    StringBuilder sb = new StringBuilder();
    bool first = true;
    for (int i = 0; i != ht.RunCount; ++i)
    {
        TextRun tr = ht.GetRunAt(i);
        if (tr is TextRunLatin)
        {
            string strRun = tr.GetPlainText();
            // The punctuation test looks at the raw text, before escaping
            if (!first && strRun != string.Empty && !char.IsPunctuation(strRun[0]))
            {
                sb.Append(' ');
            }
            // Latin text goes through the same escape helper as hanzi and pinyin below,
            // so markup characters in dictionary text cannot break the generated HTML.
            if (!string.IsNullOrEmpty(strRun))
            {
                sb.Append(escape(strRun));
            }
        }
        else
        {
            if (!first)
            {
                sb.Append(' ');
            }
            TextRunZho trz = tr as TextRunZho;

            // Primary Hanzi: traditional if that is the requested script, simplified otherwise
            string hanzi1 = (script == SearchScript.Traditional) ? trz.Trad : trz.Simp;
            if (string.IsNullOrEmpty(hanzi1))
            {
                hanzi1 = null;
            }
            // Secondary Hanzi: traditional, only in "both" mode and only next to a primary
            string hanzi2 = null;
            if (hanzi1 != null && script == SearchScript.Both && !string.IsNullOrEmpty(trz.Trad))
            {
                hanzi2 = trz.Trad;
            }
            if (hanzi1 != null)
            {
                hanzi1 = escape(hanzi1);
            }
            if (hanzi2 != null)
            {
                hanzi2 = escape(hanzi2);
            }

            // Hanzi section, wrapped in the sense-hanzi template
            if (hanzi1 != null || hanzi2 != null)
            {
                sb.Append(templateSenseHanziOpen);
            }
            if (hanzi1 != null)
            {
                sb.Append(hanzi1);
            }
            if (hanzi2 != null)
            {
                sb.Append(' ');
                sb.Append(templateBullet);
                sb.Append(' ');
                sb.Append(hanzi2);
            }
            if (hanzi1 != null || hanzi2 != null)
            {
                sb.Append(templateSenseHanziClose);
            }

            // Pinyin in brackets, separated from Hanzi by a space
            if (trz.Pinyin != null)
            {
                if (hanzi1 != null)
                {
                    sb.Append(' ');
                }
                sb.Append('[');
                sb.Append(escape(trz.GetPinyinInOne(true)));
                sb.Append(']');
            }
        }
        first = false;
    }
    return sb.ToString();
}
/// <summary>
/// Ctor: read from binary stream.
/// </summary>
/// <param name="br">Reader to deserialize from; domain, equiv and note are read in that order.</param>
public CedictSense(BinReader br)
{
    // The three parts are read strictly in sequence from the same reader
    HybridText domain = HybridText.Deserialize(br);
    HybridText equiv = HybridText.Deserialize(br);
    HybridText note = HybridText.Deserialize(br);

    this.Domain = domain;
    this.Equiv = equiv;
    this.Note = note;
}
/// <summary>
/// Retrieves matching entries for a target-language search expression.
/// </summary>
/// <param name="br">Reader used to load index instances and entry headwords.</param>
/// <param name="query">Query text; surrounding whitespace is trimmed before use.</param>
/// <returns>
/// Results ordered by descending best sense score. Empty if the query is blank,
/// contains an unknown or Chinese token, or nothing matches.
/// </returns>
private List<CedictResult> doTargetLookup(BinReader br, string query)
{
    // A blank query yields no results
    query = query.Trim();
    if (query == string.Empty)
    {
        return new List<CedictResult>();
    }

    // Tokenize the query and gather its distinct token IDs
    ReadOnlyCollection<EquivToken> txtTokenized = tokenizer.Tokenize(new HybridText(query));
    HashSet<int> queryIds = new HashSet<int>();
    foreach (EquivToken token in txtTokenized)
    {
        // One unknown (or Chinese) token means no sense can match: bail out right away
        if (token.TokenId == WordHolder.IdUnknown || token.TokenId == index.WordHolder.IdZho)
        {
            return new List<CedictResult>();
        }
        queryIds.Add(token.TokenId);
    }

    // Count, per tokenized sense, how many query tokens occur in it.
    // Only senses seen for the first token are tracked: a sense must contain *all* query tokens.
    Dictionary<int, SenseLookupInfo> hitsBySense = new Dictionary<int, SenseLookupInfo>();
    bool isFirstToken = true;
    foreach (int tokenId in queryIds)
    {
        List<SenseInfo> instances = index.SenseIndex[tokenId].GetOrLoadInstances(br);
        foreach (SenseInfo si in instances)
        {
            SenseLookupInfo info;
            if (hitsBySense.TryGetValue(si.TokenizedSenseId, out info))
            {
                ++info.NumOfQueryTokensInSense;
            }
            else if (isFirstToken)
            {
                info = new SenseLookupInfo
                {
                    NumOfQueryTokensInSense = 1,
                    TokensInSense = si.TokensInSense
                };
                hitsBySense[si.TokenizedSenseId] = info;
            }
        }
        isFirstToken = false;
    }

    // Keep the sense IDs (positions) whose count equals the number of query tokens
    List<int> sensePosList = new List<int>();
    foreach (KeyValuePair<int, SenseLookupInfo> pair in hitsBySense)
    {
        if (pair.Value.NumOfQueryTokensInSense == queryIds.Count)
        {
            sensePosList.Add(pair.Key);
        }
    }

    // Load each tokenized sense to find out:
    // - whether entry is a real match
    // - entry ID
    // - best score for entry (multiple senses may hold query string)
    // - highlights
    Dictionary<int, EntryMatchInfo> entryIdToInfo = new Dictionary<int, EntryMatchInfo>();
    foreach (int senseId in sensePosList)
    {
        doVerifyTarget(txtTokenized, senseId, entryIdToInfo, br);
    }

    // Drop entries with unprintable hanzi in the headword.
    // The full entry is not loaded, so unsupported chars in the hybrid text of
    // senses may still slip through. There's a limit to perfectionism.
    List<EntryMatchInfo> ranked = new List<EntryMatchInfo>();
    foreach (EntryMatchInfo match in entryIdToInfo.Values)
    {
        string simp, trad;
        br.Position = match.EntryId;
        CedictEntry.DeserializeHanzi(br, out simp, out trad);
        if (areHanziCovered(simp, trad))
        {
            ranked.Add(match);
        }
    }

    // Best score first
    ranked.Sort((a, b) => b.BestSenseScore.CompareTo(a.BestSenseScore));

    // Wrap into results
    List<CedictResult> results = new List<CedictResult>();
    foreach (EntryMatchInfo match in ranked)
    {
        results.Add(new CedictResult(match.EntryId,
            new ReadOnlyCollection<CedictTargetHighlight>(match.TargetHilites)));
    }
    return results;
}
/// <summary>
/// Retrieves matching entries for a target-language search expression.
/// </summary>
/// <param name="br">Reader used to load index instances and entry headwords.</param>
/// <param name="query">Query text; surrounding whitespace is trimmed before use.</param>
/// <returns>
/// Results ordered by descending best sense score. Empty if the query is blank,
/// contains an unknown or Chinese token, or nothing matches.
/// </returns>
private List<CedictResult> doTargetLookup(BinReader br, string query)
{
    // A blank query yields no results
    query = query.Trim();
    if (query == string.Empty)
    {
        return new List<CedictResult>();
    }

    // Tokenize the query and gather its distinct token IDs
    ReadOnlyCollection<EquivToken> txtTokenized = tokenizer.Tokenize(new HybridText(query));
    HashSet<int> queryIds = new HashSet<int>();
    foreach (EquivToken token in txtTokenized)
    {
        // One unknown (or Chinese) token means no sense can match: bail out right away
        if (token.TokenId == WordHolder.IdUnknown || token.TokenId == index.WordHolder.IdZho)
        {
            return new List<CedictResult>();
        }
        queryIds.Add(token.TokenId);
    }

    // Count, per tokenized sense, how many query tokens occur in it.
    // Only senses seen for the first token are tracked: a sense must contain *all* query tokens.
    Dictionary<int, SenseLookupInfo> hitsBySense = new Dictionary<int, SenseLookupInfo>();
    bool isFirstToken = true;
    foreach (int tokenId in queryIds)
    {
        List<SenseInfo> instances = index.SenseIndex[tokenId].GetOrLoadInstances(br);
        foreach (SenseInfo si in instances)
        {
            SenseLookupInfo info;
            if (hitsBySense.TryGetValue(si.TokenizedSenseId, out info))
            {
                ++info.NumOfQueryTokensInSense;
            }
            else if (isFirstToken)
            {
                info = new SenseLookupInfo
                {
                    NumOfQueryTokensInSense = 1,
                    TokensInSense = si.TokensInSense
                };
                hitsBySense[si.TokenizedSenseId] = info;
            }
        }
        isFirstToken = false;
    }

    // Keep the sense IDs (positions) whose count equals the number of query tokens
    List<int> sensePosList = new List<int>();
    foreach (KeyValuePair<int, SenseLookupInfo> pair in hitsBySense)
    {
        if (pair.Value.NumOfQueryTokensInSense == queryIds.Count)
        {
            sensePosList.Add(pair.Key);
        }
    }

    // Load each tokenized sense to find out:
    // - whether entry is a real match
    // - entry ID
    // - best score for entry (multiple senses may hold query string)
    // - highlights
    Dictionary<int, EntryMatchInfo> entryIdToInfo = new Dictionary<int, EntryMatchInfo>();
    foreach (int senseId in sensePosList)
    {
        doVerifyTarget(txtTokenized, senseId, entryIdToInfo, br);
    }

    // Drop entries with unprintable hanzi in the headword.
    // The full entry is not loaded, so unsupported chars in the hybrid text of
    // senses may still slip through. There's a limit to perfectionism.
    List<EntryMatchInfo> ranked = new List<EntryMatchInfo>();
    foreach (EntryMatchInfo match in entryIdToInfo.Values)
    {
        string simp, trad;
        br.Position = match.EntryId;
        CedictEntry.DeserializeHanzi(br, out simp, out trad);
        if (areHanziCovered(simp, trad))
        {
            ranked.Add(match);
        }
    }

    // Best score first
    ranked.Sort((a, b) => b.BestSenseScore.CompareTo(a.BestSenseScore));

    // Wrap into results
    List<CedictResult> results = new List<CedictResult>();
    foreach (EntryMatchInfo match in ranked)
    {
        results.Add(new CedictResult(match.EntryId,
            new ReadOnlyCollection<CedictTargetHighlight>(match.TargetHilites)));
    }
    return results;
}
/// <summary>
/// Breaks down body content into typographic blocks and caches the size of these.
/// </summary>
/// <param name="g">A Graphics object used for measurements. Its TextRenderingHint is set to AntiAlias.</param>
private void doMeasureBlocks(Graphics g)
{
    // Measurement happens only once; afterwards the cached blocks are reused
    if (measuredBlocks != null)
    {
        return;
    }

    // Measurement setup
    StringFormat sf = StringFormat.GenericTypographic;
    g.TextRenderingHint = System.Drawing.Text.TextRenderingHint.AntiAlias;

    // Sense ID box is a square whose side is the height of a Latin letter
    SizeF letterSize = g.MeasureString("x", getFont(fntSenseLatin), 65535, sf);
    ushort senseIdxWidth = (ushort)Math.Ceiling(letterSize.Height);

    // One slot per sense: the highlight in that sense's equiv, or null
    CedictTargetHighlight[] hiliteBySense = new CedictTargetHighlight[entry.SenseCount];
    foreach (CedictTargetHighlight hilite in res.TargetHilites)
    {
        hiliteBySense[hilite.SenseIx] = hilite;
    }

    List<Block> newBlocks = new List<Block>();
    // Links are gathered here and only kept at the end if there are any
    List<LinkArea> newLinks = new List<LinkArea>();

    int senseIdx = -1;
    int displaySenseIdx = -1;
    bool lastWasClassifier = false;
    foreach (CedictSense sense in entry.Senses)
    {
        ++senseIdx;
        // A "CL:" domain marks a classifier pseudo-sense, which gets no sense ID of its own
        bool classifier = sense.Domain.EqualsPlainText("CL:");

        int startIX;
        if (classifier)
        {
            // Domain is replaced by the localized "Classifier:" label, forced onto a new line
            startIX = newBlocks.Count;
            HybridText label = new HybridText(tprov.GetString("ResultCtrlClassifier"));
            makeBlocks(label, true, null, newBlocks, newLinks);
            Block labelHead = newBlocks[startIX];
            labelHead.NewLine = true;
            newBlocks[startIX] = labelHead;
        }
        else
        {
            // Regular sense: numbered ID block first, then the domain
            ++displaySenseIdx;
            newBlocks.Add(new Block
            {
                Width = senseIdxWidth,
                StickRight = true,
                TextPos = textPool.PoolString(getSenseIdString(displaySenseIdx)),
                NewLine = lastWasClassifier,
                SenseId = true,
                FirstInCedictSense = true,
            });
            startIX = newBlocks.Count;
            makeBlocks(sense.Domain, true, null, newBlocks, newLinks);
        }

        // Equiv (with highlight, if any) and note follow in both cases
        makeBlocks(sense.Equiv, false, hiliteBySense[senseIdx], newBlocks, newLinks);
        makeBlocks(sense.Note, true, null, newBlocks, newLinks);

        // A classifier has no ID block, so its first content block marks the sense start
        if (classifier)
        {
            Block senseStart = newBlocks[startIX];
            senseStart.FirstInCedictSense = true;
            newBlocks[startIX] = senseStart;
        }

        // Measure every block created for this sense (the ID block already has its width)
        for (int i = startIX; i != newBlocks.Count; ++i)
        {
            Block current = newBlocks[i];
            string text = textPool.GetString(current.TextPos);
            bool isLatin = current.FontIdx == fntMetaLatin || current.FontIdx == fntSenseLatin;
            SizeF size = isLatin
                ? g.MeasureString(text, getFont(current.FontIdx), 65535, sf)
                : HanziRenderer.MeasureString(g, Magic.ZhoContentFontFamily, text, Magic.LemmaHanziFontSize);
            current.Width = (ushort)Math.Round(size.Width);
            newBlocks[i] = current;
        }

        lastWasClassifier = classifier;
    }

    if (newLinks.Count != 0)
    {
        targetLinks = newLinks;
    }
    measuredBlocks = newBlocks.ToArray();
}
/// <summary>
/// Converts a hybrid text to CEDICT-formatted plain text (marking up hanzi+pinyin sections).
/// </summary>
/// <param name="ht">Hybrid text to serialize, run by run.</param>
/// <returns>
/// Latin runs verbatim; Chinese runs as simplified, then "|" plus traditional when it differs,
/// then pinyin in square brackets. Runs are separated by a single space, except that no space
/// is put before a Latin run that is empty or starts with punctuation.
/// </returns>
public static string HybridToCedict(HybridText ht)
{
    StringBuilder result = new StringBuilder();
    for (int i = 0; i < ht.RunCount; ++i)
    {
        bool notFirst = i > 0;
        TextRun run = ht.GetRunAt(i);

        if (!(run is TextRunLatin))
        {
            // Chinese run: always separated from whatever came before
            if (notFirst)
            {
                result.Append(' ');
            }
            TextRunZho zho = run as TextRunZho;
            if (!string.IsNullOrEmpty(zho.Simp))
            {
                result.Append(zho.Simp);
            }
            if (zho.Trad != zho.Simp && !string.IsNullOrEmpty(zho.Trad))
            {
                result.Append('|').Append(zho.Trad);
            }
            if (zho.Pinyin != null)
            {
                result.Append('[').Append(GetPinyinCedict(zho.Pinyin)).Append(']');
            }
            continue;
        }

        // Latin run: punctuation sticks to the preceding run
        string text = run.GetPlainText();
        if (notFirst && text != string.Empty && !char.IsPunctuation(text[0]))
        {
            result.Append(' ');
        }
        result.Append(text);
    }
    return result.ToString();
}
/// <summary>
/// Parses an entry (line) that has been separated into headword and rest.
/// </summary>
/// <param name="strHead">Headword part of the line; matched against reHead.</param>
/// <param name="strBody">Rest of the line: senses separated by slashes.</param>
/// <param name="logStream">Receives warnings and errors; may be null, in which case nothing is logged.</param>
/// <param name="lineNum">Line number, used only in log messages.</param>
/// <returns>The parsed entry, or null if the line could not be parsed into at least one sense.</returns>
private static CedictEntry parseEntry(string strHead, string strBody, StreamWriter logStream, int lineNum)
{
    // Formats one log line for this entry and writes it if a log stream was supplied
    Action<string, string> log = (format, detail) =>
    {
        string msg = string.Format(format, lineNum, detail);
        if (logStream != null)
        {
            logStream.WriteLine(msg);
        }
    };

    // Decompose head
    Match hm = reHead.Match(strHead);
    if (!hm.Success)
    {
        log("Line {0}: ERROR: Invalid header syntax: {1}", strHead);
        return null;
    }
    string hanziGroup1 = hm.Groups[1].Value;
    string hanziGroup2 = hm.Groups[2].Value;

    // Split pinyin by spaces and convert it to our normalized format
    string[] pinyinParts = hm.Groups[3].Value.Split(new char[] { ' ' });
    PinyinSyllable[] pinyinSylls;
    List<int> pinyinMap;
    normalizePinyin(pinyinParts, out pinyinSylls, out pinyinMap);

    // Weird syllables (tone -1) found > warning
    if (Array.Exists(pinyinSylls, x => x.Tone == -1))
    {
        log("Line {0}: Warning: Weird pinyin syllable: {1}", strHead);
    }

    // Trad and simp MUST have same # of chars, always
    if (hanziGroup1.Length != hanziGroup2.Length)
    {
        log("Line {0}: ERROR: Trad/simp char count mismatch: {1}", strHead);
        return null;
    }

    // Transform map so it says, for each hanzi, which pinyin syllable it corresponds to.
    // Some chars in hanzi may have no pinyin: when hanzi includes a non-ideographic character.
    short[] hanziToPinyin = transformPinyinMap(hanziGroup1, pinyinMap);
    // Headword MUST have same number of ideo characters as non-weird pinyin syllables.
    // A failed mapping is only a warning: parsing goes on with a null map.
    if (hanziToPinyin == null)
    {
        log("Line {0}: Warning: Failed to match hanzi to pinyin: {1}", strHead);
    }

    // Split meanings by slash, keeping only the non-blank ones (trimmed)
    string[] meaningsRaw = strBody.Split(new char[] { '/' });
    List<string> meanings = new List<string>();
    foreach (string raw in meaningsRaw)
    {
        string trimmed = raw.Trim();
        if (trimmed != "")
        {
            meanings.Add(trimmed);
        }
    }
    if (meaningsRaw.Length != meanings.Count)
    {
        log("Line {0}: Warning: Empty sense in entry: {1}", strBody);
    }
    // At least one meaning!
    if (meanings.Count == 0)
    {
        log("Line {0}: ERROR: No sense: {1}", strBody);
        return null;
    }

    // Separate domain, equiv and note in each sense
    List<CedictSense> cedictSenses = new List<CedictSense>();
    foreach (string meaning in meanings)
    {
        string domain, equiv, note;
        trimSense(meaning, out domain, out equiv, out note);
        // Equiv is empty: merits at least a warning
        if (equiv == "")
        {
            log("Line {0}: Warning: No equivalent in sense, only domain/notes: {1}", meaning);
        }
        // Convert all parts of sense to hybrid text
        HybridText hDomain = plainTextToHybrid(domain, lineNum, logStream);
        HybridText hEquiv = plainTextToHybrid(equiv, lineNum, logStream);
        HybridText hNote = plainTextToHybrid(note, lineNum, logStream);
        // Store new sense - unless we failed to parse anything properly
        bool allParsed = hDomain != null && hEquiv != null && hNote != null;
        if (allParsed)
        {
            cedictSenses.Add(new CedictSense(hDomain, hEquiv, hNote));
        }
    }
    // If there are no senses, we failed. But that will have been logged before, so just return null.
    if (cedictSenses.Count == 0)
    {
        return null;
    }

    // Done with entry
    return new CedictEntry(
        hanziGroup2,
        hanziGroup1,
        new ReadOnlyCollection<PinyinSyllable>(pinyinSylls),
        new ReadOnlyCollection<CedictSense>(cedictSenses),
        hanziToPinyin);
}
/// <summary>
/// Returns true if display font covers all Hanzi in hybrid text; false otherwise.
/// </summary>
/// <param name="ht">Hybrid text whose Chinese runs are checked; an empty text counts as covered.</param>
/// <returns>
/// False as soon as one simplified character lacks <see cref="FontCoverageFlags.Simp"/> coverage
/// or one traditional character lacks <see cref="FontCoverageFlags.Trad"/> coverage; true otherwise.
/// </returns>
private bool areHanziCovered(HybridText ht)
{
    if (ht.IsEmpty)
    {
        return true;
    }
    for (int i = 0; i != ht.RunCount; ++i)
    {
        // Only Chinese runs carry Hanzi
        TextRunZho zho = ht.GetRunAt(i) as TextRunZho;
        if (zho == null)
        {
            continue;
        }
        // Simplified and traditional are checked independently: a run that has no
        // simplified text must still have its traditional text verified.
        if (zho.Simp != null)
        {
            foreach (char c in zho.Simp)
            {
                if (!cvr.GetCoverage(c).HasFlag(FontCoverageFlags.Simp))
                {
                    return false;
                }
            }
        }
        if (zho.Trad != null)
        {
            foreach (char c in zho.Trad)
            {
                if (!cvr.GetCoverage(c).HasFlag(FontCoverageFlags.Trad))
                {
                    return false;
                }
            }
        }
    }
    return true;
}
/// <summary>
/// Writes a sense's equiv to the writer one character at a time, wrapping highlighted
/// stretches in spans.
/// </summary>
/// <param name="writer">HTML writer that receives the encoded text and span tags.</param>
/// <param name="equiv">Hybrid text to render.</param>
/// <param name="hl">Highlight passed on to the text consumer.</param>
/// <param name="nobr">
/// When true, one extra end tag is rendered at the first space (or at the end of the text if
/// there is no space) - presumably closing a "nobr" wrapper opened by the caller; confirm there.
/// </param>
private void renderEquiv(HtmlTextWriter writer, HybridText equiv, CedictTargetHighlight hl, bool nobr)
{
    HybridTextConsumer consumer = new HybridTextConsumer(equiv, hl);
    bool pastFirstWord = false;
    bool spanOpen = false;

    for (;;)
    {
        char ch;
        bool highlighted;
        consumer.GetNext(out ch, out highlighted);
        // Zero character signals the end of the text
        if (ch == '\0')
        {
            break;
        }

        if (highlighted && !spanOpen)
        {
            // Highlight starts. The very first word gets a special class if the hilite goes
            // beyond the first space and we're in nobr mode; plain hilite everywhere else.
            string cssClass = (!pastFirstWord && nobr && consumer.IsNextSpaceInHilite())
                ? "sense-hl-start"
                : "sense-hl";
            writer.AddAttribute(HtmlTextWriterAttribute.Class, cssClass);
            writer.RenderBeginTag(HtmlTextWriterTag.Span);
            spanOpen = true;
        }
        else if (!highlighted && spanOpen)
        {
            // Highlight ends
            writer.RenderEndTag();
            spanOpen = false;
        }

        // First space in nobr mode: the first word is over
        if (ch == ' ' && !pastFirstWord && nobr)
        {
            pastFirstWord = true;
            writer.RenderEndTag();
            if (spanOpen)
            {
                // A highlight was open too: render a second end tag, then continue
                // the highlight in a fresh span for the rest of the text
                writer.RenderEndTag();
                writer.AddAttribute(HtmlTextWriterAttribute.Class, "sense-hl-end");
                writer.RenderBeginTag(HtmlTextWriterTag.Span);
            }
        }

        // Render character
        writer.WriteEncodedText(ch.ToString());
    }

    // Close whatever is still open: the first-word wrapper and/or a highlight
    if (!pastFirstWord && nobr)
    {
        writer.RenderEndTag();
    }
    if (spanOpen)
    {
        writer.RenderEndTag();
    }
}
/// <summary>
/// Ctor: stores the text and highlight, and caches the plain text of the first run.
/// </summary>
/// <param name="txt">Hybrid text to consume.</param>
/// <param name="hl">Highlight within the text; stored as given.</param>
public HybridTextConsumer(HybridText txt, CedictTargetHighlight hl)
{
    this.txt = txt;
    this.hl = hl;
    // A hybrid text without runs has no first run to read: start from an empty string
    // instead of failing in GetRunAt(0).
    // NOTE(review): confirm the consuming methods treat an empty first run as end of text.
    runTxt = txt.RunCount > 0 ? txt.GetRunAt(0).GetPlainText() : string.Empty;
}