/// <summary>
/// <para>Builds hybrid text (a sequence of mixed runs) from a string with embedded Chinese ranges.</para>
/// </summary>
/// <param name="str">String to parse. Can be null or empty.</param>
/// <param name="lineNum">Line number in input file; only used in the logged error message.</param>
/// <param name="logStream">Log stream for the error message. If null, nothing is logged.</param>
/// <returns>
/// HybridText.Empty for null or empty input; otherwise the input's hybrid text representation,
/// or null if a Latin run remains that contains "|" or for which hasIdeo returns true.
/// </returns>
private static HybridText plainTextToHybrid(string str, int lineNum, StreamWriter logStream)
{
    if (string.IsNullOrEmpty(str))
    {
        return HybridText.Empty;
    }

    // Seed the list with the entire input as one Latin run.
    List<TextRun> runs = new List<TextRun> { new TextRunLatin(str) };

    // Apply the matchers one after the other, from reSTP down to reP.
    // Each pass may replace a run with one or more runs.
    runs = splitRuns(runs, reSTP, new MatchTranslatorSTP());
    runs = splitRuns(runs, reSP, new MatchTranslatorSP());
    runs = splitRuns(runs, reST, new MatchTranslatorST());
    runs = splitRuns(runs, reS, new MatchTranslatorS());
    runs = splitRuns(runs, reP, new MatchTranslatorP());

    // Sanity check: whatever is still Latin must not contain "|" or hanzi.
    foreach (TextRun run in runs)
    {
        TextRunLatin latinRun = run as TextRunLatin;
        if (latinRun == null)
        {
            continue;
        }

        string latinText = latinRun.GetPlainText();
        bool isClean = !latinText.Contains("|") && !hasIdeo(latinText);
        if (isClean)
        {
            continue;
        }

        // Leftover markup: report (if we have a log) and give up on this input.
        if (logStream != null)
        {
            logStream.WriteLine(string.Format("Line {0}: ERROR: Failed to convert sense to hybrid text: {1}", lineNum, str));
        }
        return null;
    }

    // All runs passed the check.
    return new HybridText(new ReadOnlyCollection<TextRun>(runs));
}
/// <summary>
/// Collects the normalized tokens of a Latin text run into the given set.
/// </summary>
/// <param name="tr">Latin run whose plain text is tokenized.</param>
/// <param name="tokens">Set that receives each token.</param>
private void getTokens(TextRunLatin tr, HashSet<string> tokens)
{
    // Words are delimited by spaces and hyphens.
    char[] separators = { ' ', '-' };
    foreach (string rawWord in tr.GetPlainText().Split(separators))
    {
        string token = rawWord.Trim(trimPuncChars);
        if (token.Length == 0)
        {
            // Nothing left after trimming
            continue;
        }

        // Anything matching reNumbers is replaced by one shared placeholder;
        // everything else is lower-cased.
        token = reNumbers.IsMatch(token) ? "*num*" : token.ToLowerInvariant();
        tokens.Add(token);
    }
}
/// <summary>
/// Splits a Latin text run into words, normalizes each word, and adds the results to a set.
/// </summary>
/// <param name="tr">Latin run to read the plain text from.</param>
/// <param name="tokens">Set the normalized tokens are added to.</param>
private void getTokens(TextRunLatin tr, HashSet<string> tokens)
{
    // Break the text at every space and hyphen.
    string[] words = tr.GetPlainText().Split(new[] { ' ', '-' });
    for (int i = 0; i < words.Length; ++i)
    {
        string word = words[i].Trim(trimPuncChars);
        if (word.Length > 0)
        {
            // Matches of reNumbers all map to the same placeholder token;
            // other words are stored in lower case.
            string normalized = reNumbers.IsMatch(word) ? "*num*" : word.ToLowerInvariant();
            tokens.Add(normalized);
        }
    }
}
/// <summary>
/// <para>Produces unmeasured display blocks from a single hybrid text. Marks highlights, if any.</para>
/// <para>Does not fill in blocks' size, but fills in everything else.</para>
/// </summary>
/// <param name="htxt">Hybrid text to break down into blocks.</param>
/// <param name="isMeta">True if this is a domain or note (displayed in italics); selects the meta fonts instead of the sense fonts.</param>
/// <param name="hl">Highlight to show in hybrid text, or null.</param>
/// <param name="blocks">List of blocks to append to.</param>
/// <param name="links">List to gather links (appending to list).</param>
private void makeBlocks(HybridText htxt, bool isMeta, CedictTargetHighlight hl, List<Block> blocks, List<LinkArea> links)
{
    byte fntIdxLatin = isMeta ? fntMetaLatin : fntSenseLatin;
    byte fntIdxZhoSimp = isMeta ? fntMetaHanziSimp : fntSenseHanziSimp;
    byte fntIdxZhoTrad = isMeta ? fntMetaHanziTrad : fntSenseHanziTrad;

    // Go run by run
    for (int runIX = 0; runIX != htxt.RunCount; ++runIX)
    {
        TextRun run = htxt.GetRunAt(runIX);
        if (run is TextRunLatin)
        {
            // Only a highlight that targets this very run is relevant for its blocks
            CedictTargetHighlight runHl = (hl != null && hl.RunIx == runIX) ? hl : null;
            makeLatinBlocks(run, runHl, fntIdxLatin, blocks);
        }
        else
        {
            // Every run that is not Latin is handled as a Chinese run
            TextRunZho zhoRun = run as TextRunZho;
            // Look ahead: the Chinese blocks need to know if a Latin run follows
            TextRunLatin nextLatinRun = null;
            if (runIX + 1 < htxt.RunCount)
            {
                nextLatinRun = htxt.GetRunAt(runIX + 1) as TextRunLatin;
            }
            makeZhoBlocks(zhoRun, nextLatinRun, fntIdxLatin, fntIdxZhoSimp, fntIdxZhoTrad, blocks, links);
        }
    }
}

/// <summary>
/// Appends the blocks of one Latin run: text is split by spaces, then each word by dashes.
/// </summary>
/// <param name="run">The Latin run.</param>
/// <param name="hl">Highlight that applies to this run, or null if none does.</param>
/// <param name="fntIdxLatin">Font index for the new blocks.</param>
/// <param name="blocks">List of blocks to append to.</param>
private void makeLatinBlocks(TextRun run, CedictTargetHighlight hl, byte fntIdxLatin, List<Block> blocks)
{
    string[] bySpaces = run.GetPlainText().Split(new char[] { ' ' });
    // Position of the current word within the run's text - for highlights
    int latnPos = 0;
    foreach (string str in bySpaces)
    {
        string[] byDashes = splitByDash(str);
        // Offset of the current piece within the word - for highlights
        int ofsPos = 0;
        foreach (string blockStr in byDashes)
        {
            Block tb = new Block
            {
                TextPos = textPool.PoolString(blockStr),
                FontIdx = fntIdxLatin,
                SpaceAfter = false, // will set this true for last block in "byDashes"
            };
            // Does block's text intersect with highlight?
            if (hl != null)
            {
                int blockStart = latnPos + ofsPos;
                int blockEnd = blockStart + blockStr.Length;
                if (touchesHighlight(hl, blockStart, blockEnd))
                {
                    tb.Hilite = true;
                }
            }
            blocks.Add(tb);
            ofsPos += blockStr.Length;
        }
        // Make sure last one is followed by space.
        // Guard: with an empty list there is no last block, and indexing would throw.
        if (blocks.Count > 0)
        {
            Block xb = blocks[blocks.Count - 1];
            xb.SpaceAfter = true;
            blocks[blocks.Count - 1] = xb;
        }
        // Skip the word and the space that followed it
        latnPos += str.Length + 1;
    }
}

/// <summary>
/// Checks a block's character range against the highlight: true if the block starts inside
/// the highlight, ends inside it, or encloses it.
/// </summary>
/// <param name="hl">The highlight; must not be null.</param>
/// <param name="blockStart">Index of the block's first character within the run.</param>
/// <param name="blockEnd">Index just past the block's last character.</param>
/// <returns>True if the block is to be shown highlighted.</returns>
private static bool touchesHighlight(CedictTargetHighlight hl, int blockStart, int blockEnd)
{
    bool startsInside = blockStart >= hl.HiliteStart && blockStart < hl.HiliteStart + hl.HiliteLength;
    bool endsInside = blockEnd > hl.HiliteStart && blockEnd <= hl.HiliteStart + hl.HiliteLength;
    bool enclosesHighlight = blockStart < hl.HiliteStart && blockEnd >= hl.HiliteStart + hl.HiliteLength;
    return startsInside || endsInside || enclosesHighlight;
}

/// <summary>
/// <para>Appends the blocks of one Chinese run and collects its link area.</para>
/// <para>What gets shown depends on the T/S/Both display mode, and on available info.</para>
/// </summary>
/// <param name="zhoRun">The Chinese run.</param>
/// <param name="nextLatinRun">The run right after this one if it is Latin; null otherwise.</param>
/// <param name="fntIdxLatin">Font index for separator and pinyin blocks.</param>
/// <param name="fntIdxZhoSimp">Font index for the simplified block.</param>
/// <param name="fntIdxZhoTrad">Font index for the traditional block.</param>
/// <param name="blocks">List of blocks to append to.</param>
/// <param name="links">List to gather links (appending to list).</param>
private void makeZhoBlocks(TextRunZho zhoRun, TextRunLatin nextLatinRun, byte fntIdxLatin, byte fntIdxZhoSimp, byte fntIdxZhoTrad, List<Block> blocks, List<LinkArea> links)
{
    // Chinese range is made up of:
    // Simplified (empty string if only traditional requested)
    // Separator (if both simplified and traditional are requested)
    // Traditional (empty string if only simplified requested)
    // Pinyin with accents as tone marks, in brackets (if present)
    string strSimp = string.Empty;
    if (analyzedScript != SearchScript.Traditional && zhoRun.Simp != null)
    {
        strSimp = zhoRun.Simp;
    }
    string strTrad = string.Empty;
    if (analyzedScript != SearchScript.Simplified && zhoRun.Trad != null)
    {
        strTrad = zhoRun.Trad;
    }
    string strPy = string.Empty;
    string strPyNumbers = string.Empty;
    if (zhoRun.Pinyin != null)
    {
        // Display format for the blocks (tone marks as diacritics; r5 glued)
        strPy = "[" + zhoRun.GetPinyinInOne(true) + "]";
        // Pinyin with numbers as tone marks, for the link's query string
        strPyNumbers = zhoRun.GetPinyinRaw();
    }

    // Create link area, with query string
    LinkArea linkArea = new LinkArea(strSimp, strTrad, strPyNumbers, analyzedScript);

    // Block for simplified, if present
    if (strSimp != string.Empty)
    {
        addLinkedBlock(strSimp, fntIdxZhoSimp, blocks, linkArea);
    }
    // Separator if both simplified and traditional are there
    // AND they are different...
    if (strSimp != string.Empty && strTrad != string.Empty && strSimp != strTrad)
    {
        // The simplified block was just added; make it stick to the separator
        Block xb = blocks[blocks.Count - 1];
        xb.StickRight = true;
        blocks[blocks.Count - 1] = xb;
        addLinkedBlock("•", fntIdxLatin, blocks, linkArea);
    }
    // Traditional, if present and not a repetition of simplified
    if (strTrad != string.Empty && strTrad != strSimp)
    {
        addLinkedBlock(strTrad, fntIdxZhoTrad, blocks, linkArea);
    }
    // Pinyin, if present: one block per space-separated part
    if (strPy != string.Empty)
    {
        string[] pyParts = strPy.Split(new char[] { ' ' });
        foreach (string pyPart in pyParts)
        {
            addLinkedBlock(pyPart, fntIdxLatin, blocks, linkArea);
        }
    }

    // Last part will have requested a space after.
    // If next text run is Latin and starts with punctuation, make it stick.
    // Guards: an empty block list has no last block to update, and an empty
    // Latin text has no first character; both used to throw.
    if (nextLatinRun != null && blocks.Count > 0)
    {
        string nextText = nextLatinRun.GetPlainText();
        if (!string.IsNullOrEmpty(nextText) && char.IsPunctuation(nextText[0]))
        {
            Block xb = blocks[blocks.Count - 1];
            xb.SpaceAfter = false;
            blocks[blocks.Count - 1] = xb;
        }
    }

    // Collect link area
    links.Add(linkArea);
}

/// <summary>
/// Appends one block (followed by a space) and registers its index with the link area.
/// </summary>
/// <param name="text">The block's text; gets pooled.</param>
/// <param name="fontIdx">The block's font index.</param>
/// <param name="blocks">List of blocks to append to.</param>
/// <param name="linkArea">Link area whose block IDs receive the new block's index.</param>
private void addLinkedBlock(string text, byte fontIdx, List<Block> blocks, LinkArea linkArea)
{
    Block tb = new Block
    {
        TextPos = textPool.PoolString(text),
        FontIdx = fontIdx,
        SpaceAfter = true,
    };
    blocks.Add(tb);
    linkArea.BlockIds.Add(blocks.Count - 1);
}