/// ------------------------------------------------------------------------------------
/// <summary>
/// Exports a Unicode string property of the current object. The text is written inside
/// the field's start/end tags as a <c>Uni</c> element, or as an <c>AUni</c> element when
/// WritingSystemId returns a non-empty identifier for <paramref name="ws"/>.
/// </summary>
/// <param name="tag">Field tag of the property to read and export.</param>
/// <param name="ws">Writing system handle, passed to WritingSystemId.</param>
/// <param name="_vwvc">View constructor; not used by this override.</param>
/// ------------------------------------------------------------------------------------
public override void AddUnicodeProp(int tag, int ws, IVwViewConstructor _vwvc)
{
    CurrentContext savedContext = WriteFieldStartTag(tag);

    // Exported text must be NFC; only pay for the conversion when it is actually needed.
    string text = DataAccess.get_UnicodeProp(CurrentObject(), tag);
    Icu.InitIcuDataDir();
    bool alreadyNfc = Icu.IsNormalized(text, Icu.UNormalizationMode.UNORM_NFC);
    if (!alreadyNfc)
    {
        text = Icu.Normalize(text, Icu.UNormalizationMode.UNORM_NFC);
    }

    string wsId = WritingSystemId(ws);
    IndentLine();
    if (!String.IsNullOrEmpty(wsId))
    {
        m_writer.WriteLine("<AUni ws=\"{0}\">{1}</AUni>", wsId, XmlUtils.MakeSafeXml(text));
    }
    else
    {
        m_writer.WriteLine("<Uni>{0}</Uni>", XmlUtils.MakeSafeXml(text));
    }

    WriteFieldEndTag(tag, savedContext);
}
/// <summary>
/// Normalizing the sample word to NFC must return the same text as the literal used here,
/// and .NET must report the result as being in Form C.
/// </summary>
public void Normalize_NFC2NFC()
{
    const string sample = "tést";

    string actual = Icu.Normalize(sample, Icu.UNormalizationMode.UNORM_NFC);

    Assert.AreEqual(sample, actual);
    Assert.IsTrue(actual.IsNormalized(NormalizationForm.FormC));
}
/// <summary>
/// Writes the Hunspell files for a dictionary: the affix (.aff) file beside
/// <paramref name="dicPath"/> when it does not exist yet, and the word list file itself,
/// which is always (re)written.
/// </summary>
/// <param name="dicPath">Path of the dictionary file to write; the affix file uses the
/// same path with the extension changed to ".aff".</param>
/// <param name="words">Words to store. Each is written in NFC. The sequence is
/// enumerated exactly once.</param>
private static void InitDictionary(string dicPath, IEnumerable<string> words)
{
    var affixFile = Path.ChangeExtension(dicPath, ".aff");
    if (!File.Exists(affixFile))
    {
        using (var writer = FileUtils.OpenFileForWrite(affixFile, Encoding.UTF8))
        {
            writer.WriteLine("SET UTF-8");
            // Enhance JohnT: may be helpful to write TRY followed by the word-forming and possibly punctuation
            // characters of the language. This somehow affects the suggestions, but I haven't figured out how yet.
            writer.WriteLine("KEEPCASE " + keepCaseFlag);
        }
    }

    // If it already exists, probably we disabled it by deleting the .aff file--an approach we
    // no longer use; re-creating it should reinstate it.
    using (var writer = FileUtils.OpenFileForWrite(dicPath, Encoding.UTF8))
    {
        // Snapshot the sequence so that the size hint and the words written below come from
        // one and the same enumeration, even when the caller hands us a lazy query.
        var wordList = words.ToList();

        // This is a size of hash table to allocate, NOT the exact number of words in the dictionary.
        // In particular it must NOT be zero or Hunspell will malfunction (divide by zero).
        // However, making it equal the number of words helps Hunspell allocate a good size of hashtable.
        writer.WriteLine(Math.Max(10, wordList.Count).ToString());
        writer.WriteLine(PrototypeWord + "/" + keepCaseFlag);
        foreach (var word in wordList)
        {
            writer.WriteLine(Icu.Normalize(word, Icu.UNormalizationMode.UNORM_NFC));
        }
    }
}
/// <summary>
/// Parses the default vernacular form of <paramref name="wordform"/> and hands the result
/// to the parse filer.
/// </summary>
/// <param name="wordform">The wordform to parse.</param>
/// <param name="priority">Priority passed through to the parse filer.</param>
/// <returns>
/// <c>false</c> when the wordform is not a valid object or has no text, or when the
/// wordform's checksum equals the hash code of the new parse result; otherwise the value
/// returned by the parse filer's ProcessParse.
/// </returns>
public bool UpdateWordform(IWfiWordform wordform, ParserPriority priority)
{
    CheckDisposed();

    int wordformHash = 0;
    ITsString form = null;
    using (new WorkerThreadReadHelper(m_cache.ServiceLocator.GetInstance<IWorkerThreadReadHandler>()))
    {
        if (wordform.IsValidObject)
        {
            wordformHash = wordform.Checksum;
            form = wordform.Form.VernacularDefaultWritingSystem;
        }
    }

    // 'form' is still null (and wordformHash still 0) if the wordform was not a valid object.
    if (form == null || string.IsNullOrEmpty(form.Text))
    {
        return false;
    }

    CheckNeedsUpdate();

    // The parser is given the form in NFD with every space replaced by a period.
    ParseResult result = m_parser.ParseWord(
        Icu.Normalize(form.Text.Replace(' ', '.'), Icu.UNormalizationMode.UNORM_NFD));

    // Nothing to file when the stored checksum already matches this result.
    if (wordformHash == result.GetHashCode())
    {
        return false;
    }

    return m_parseFiler.ProcessParse(wordform, priority, result);
}
/// <summary>
/// Parses one wordform and returns the parser output in NFD, adding the time spent in the
/// parser to <c>m_ticksParser</c>.
/// </summary>
/// <param name="hvoWordform">ID of the wordform; asserted to be greater than zero.</param>
/// <param name="form">Text of the wordform; asserted to be non-null. It is converted to
/// NFD before being passed to the parser.</param>
/// <returns>The result of ParseWord, normalized to NFD.</returns>
/// <exception cref="ApplicationException">Wraps any exception raised while parsing.</exception>
private string GetOneWordformResult(int hvoWordform, string form)
{
    Debug.Assert(hvoWordform > 0, "Wordform ID must be greater than zero.");
    Debug.Assert(form != null, "Wordform form must not be null.");
    try
    {
        Trace.WriteLineIf(tracingSwitch.TraceInfo,
            "GetOneWordformResult(): CurrentThreadId = " + Win32.GetCurrentThreadId().ToString());

        // A Stopwatch is immune to system clock and daylight-saving adjustments, which could
        // skew (or even make negative) a difference of two DateTime.Now readings.
        // Elapsed.Ticks uses the same 100 ns unit as DateTime.Ticks.
        Stopwatch stopwatch = Stopwatch.StartNew();
        string results = ParseWord(Icu.Normalize(form, Icu.UNormalizationMode.UNORM_NFD), hvoWordform);
        stopwatch.Stop();

        long ttlTicks = stopwatch.Elapsed.Ticks;
        m_ticksParser += ttlTicks;
        DebugMsg("ParseWord(" + form + ") took : " + ttlTicks.ToString());
        return Icu.Normalize(results, Icu.UNormalizationMode.UNORM_NFD);
    }
    catch (Exception error)
    {
        Trace.WriteLineIf(tracingSwitch.TraceError,
            "The word '" + form + "', id='" + hvoWordform.ToString() + "' failed to parse. error was: " + error.Message);
        //might as well keep going.
        //TODO: create an problem object since we could not parse this word.
        throw new ApplicationException("Error while parsing '" + form + "'.", error);
    }
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Fast path for the simplest string XML: a single <c>Run</c> child element whose only
/// attribute is <c>ws</c>.
/// </summary>
/// <param name="rootXml">The element including the Str or AStr tag</param>
/// <param name="lgwsf">The writing system factory.</param>
/// <returns>The created TsString (run text converted to NFD), or null if the XML does
/// not have exactly that simple shape.</returns>
/// ------------------------------------------------------------------------------------
private static ITsString HandleSimpleString(XElement rootXml, ILgWritingSystemFactory lgwsf)
{
    XElement[] children = rootXml.Elements().ToArray();
    if (children.Length != 1)
    {
        return null;
    }

    XElement run = children[0];
    if (run.Name.LocalName != "Run")
    {
        // Probably an error; in any case not the simple case being optimized here.
        return null;
    }

    XAttribute[] runAttributes = run.Attributes().ToArray();
    if (runAttributes.Length != 1)
    {
        // More than a bare writing system: leave it to the general code path.
        return null;
    }

    XAttribute wsAttribute = runAttributes[0];
    if (wsAttribute.Name.LocalName != "ws")
    {
        // Only single runs carrying just the ws attribute are handled.
        return null;
    }

    // Make sure the text is in the decomposed form (FWR-148)
    string decomposedText = Icu.Normalize(run.Value, Icu.UNormalizationMode.UNORM_NFD);
    return TsStringUtils.MakeString(decomposedText, GetWsForId(wsAttribute.Value, lgwsf));
}
/// <summary>
/// Starts a new letter section in the output when the title-cased lead character of
/// <paramref name="sEntry"/> differs from the current one (<c>m_schCurrent</c>): closes
/// the previous "letData" div if one is open, writes the "letHead" block, and opens a new
/// "letData" div.
/// </summary>
/// <param name="sEntry">Entry text; converted to NFD before its lead character is taken.</param>
/// <param name="sWs">Writing system identifier passed to GetLeadChar and Icu.ToTitle.</param>
private void WriteLetterHeadIfNeeded(string sEntry, string sWs)
{
    string leadLower = GetLeadChar(Icu.Normalize(sEntry, Icu.UNormalizationMode.UNORM_NFD), sWs);
    string leadTitle = Icu.ToTitle(leadLower, sWs);
    if (leadTitle == m_schCurrent)
    {
        return; // still inside the same letter section
    }

    bool sectionIsOpen = m_schCurrent.Length > 0;
    if (sectionIsOpen)
    {
        m_writer.WriteLine("</div>"); // for letData
    }
    m_writer.WriteLine("<div class=\"letHead\">");

    // The heading shows "Title lower" when the two forms differ, otherwise just the one form.
    StringBuilder heading = new StringBuilder();
    bool hasDistinctTitle = !String.IsNullOrEmpty(leadTitle) && leadTitle != leadLower;
    if (hasDistinctTitle)
    {
        heading.Append(leadTitle.Normalize());
        heading.Append(' ');
    }
    if (!String.IsNullOrEmpty(leadLower))
    {
        heading.Append(leadLower.Normalize());
    }

    m_writer.WriteLine("<div class=\"letter\">{0}</div>", XmlUtils.MakeSafeXml(heading.ToString()));
    m_writer.WriteLine("</div>");
    m_writer.WriteLine("<div class=\"letData\">");
    m_schCurrent = leadTitle;
}
/// <summary>
/// Normalizing already-decomposed text ("e" followed by U+0301) to NFD must leave it
/// unchanged, and .NET must report the result as being in Form D.
/// </summary>
public void Normalize_NFD2NFD()
{
    const string decomposed = "te\u0301st";

    string actual = Icu.Normalize(decomposed, Icu.UNormalizationMode.UNORM_NFD);

    Assert.AreEqual(decomposed, actual);
    Assert.IsTrue(actual.IsNormalized(NormalizationForm.FormD));
}
/// <summary>
/// Parses (or traces the parse of) a single word form and stores the outcome in the
/// details of a task report.
/// </summary>
/// <param name="sForm">the word form to parse; converted to NFD before use</param>
/// <param name="fDoTrace">true to run TraceWord, false to run ParseWord</param>
/// <param name="sSelectTraceMorphs">list of msa hvos to limit trace to</param>
/// <exception cref="ArgumentNullException"><paramref name="sForm"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="sForm"/> is empty.</exception>
public void TryAWord(string sForm, bool fDoTrace, string sSelectTraceMorphs)
{
    CheckDisposed();

    if (sForm == null)
    {
        throw new ArgumentNullException("sForm", "TryAWord cannot trace a Null string.");
    }
    if (sForm.Length == 0)
    {
        throw new ArgumentException("Can't try a word with no content.", "sForm");
    }

    CheckNeedsUpdate();

    string description = string.Format(ParserCoreStrings.ksTraceWordformX, sForm);
    using (var task = new TaskReport(description, m_taskUpdateHandler))
    {
        var normForm = Icu.Normalize(sForm, Icu.UNormalizationMode.UNORM_NFD);
        if (fDoTrace)
        {
            // Trace output is stored exactly as produced.
            task.Details = TraceWord(normForm, sSelectTraceMorphs);
        }
        else
        {
            // Plain parse output is stored in NFD.
            task.Details = Icu.Normalize(ParseWord(normForm, 0), Icu.UNormalizationMode.UNORM_NFD);
        }
    }
}
/// <summary>
/// Records whether a word is to be treated as correctly spelled: updates the in-memory
/// state through SetInternalStatus and rewrites the exception file at ExceptionPath.
/// </summary>
/// <remarks>
/// The exception file holds one word per line; a leading '*' marks the word as incorrect.
/// The word is inserted before the first existing entry that does not sort before it in
/// ordinal order (the '*' is ignored for the comparison), replacing an existing entry for
/// the same word. This assumes the existing lines are already in that order.
/// </remarks>
/// <param name="word1">The word; it is converted to NFC before use.</param>
/// <param name="isCorrect">true to mark the word correct, false to mark it incorrect.</param>
public void SetStatus(string word1, bool isCorrect)
{
    var word = Icu.Normalize(word1, Icu.UNormalizationMode.UNORM_NFC);
    if (Check(word) == isCorrect)
    {
        return; // nothing to do.
    }

    // Review: any IO exceptions we should handle? How??
    SetInternalStatus(word, isCorrect);

    var builder = new StringBuilder();
    bool insertedLineForWord = false;
    if (File.Exists(ExceptionPath))
    {
        using (var reader = new StreamReader(ExceptionPath, Encoding.UTF8))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                // Strip the '*' marker so only the word itself takes part in the comparison.
                var item = line;
                if (item.Length > 0 && item[0] == '*')
                {
                    item = item.Substring(1);
                }

                // If we already got it, or the current line is before the word, just copy the line to the output.
                if (insertedLineForWord || String.Compare(item, word, System.StringComparison.Ordinal) < 0)
                {
                    builder.AppendLine(line);
                    continue;
                }

                // We've come to the right place to insert our word.
                if (!isCorrect)
                {
                    builder.Append("*");
                }
                builder.AppendLine(word);
                insertedLineForWord = true;

                if (word != item) // then current line must be a pre-existing word that comes after ours.
                {
                    builder.AppendLine(line); // so add it in after item
                }
            }
        }
    }

    if (!insertedLineForWord) // no input file, or the word comes after any existing one
    {
        // The very first exception!
        if (!isCorrect)
        {
            builder.Append("*");
        }
        builder.AppendLine(word);
    }

    // Write the new file over the old one.
    File.WriteAllText(ExceptionPath, builder.ToString(), Encoding.UTF8);
}
/// <summary>
/// Reads the text of the <c>Uni</c> child of <paramref name="reader"/> as a plain C#
/// string and returns it in NFD.
/// </summary>
/// <param name="reader">The element that contains the Uni child.</param>
/// <returns>The Uni element's value, normalized to NFD.</returns>
/// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
internal static string LoadUnicodeString(XElement reader)
{
    if (reader == null)
    {
        throw new ArgumentNullException("reader");
    }

    // NOTE(review): there is no check for a missing Uni child; the .Value access below
    // would then fail on a null reference.
    XElement uniElement = reader.Element("Uni");
    string rawText = uniElement.Value;
    return Icu.Normalize(rawText, Icu.UNormalizationMode.UNORM_NFD); // return NFD.
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Handles a complex string that contains multiple runs with optional multiple
/// text props applied.
/// </summary>
/// <param name="xml">The Str or AStr element whose Run children are read.</param>
/// <param name="lgwsf">The writing system factory.</param>
/// <returns>The created TsString, or null when the element has no Run children and is
/// not an AStr element whose single attribute is ws.</returns>
/// <exception cref="XmlSchemaException">A Run element has no ws attribute and its text
/// is empty or starts with a character whose code is greater than 13.</exception>
/// ------------------------------------------------------------------------------------
private static ITsString HandleComplexString(XElement xml, ILgWritingSystemFactory lgwsf)
{
    var runs = xml.Elements("Run");
    if (!runs.Any())
    {
        if (xml.Name.LocalName == "AStr" && xml.Attributes().Count() == 1)
        {
            // This duplicates a little bit of code from HandleSimpleString, but I wanted to keep that really simple
            // and fast, and this case hardly ever happens...maybe not at all in real life.
            XAttribute wsAttribute = xml.Attributes().First();
            if (wsAttribute.Name.LocalName != "ws")
            {
                return null; // we handle only single runs with only the ws attribute.
            }

            // Make sure the text is in the decomposed form (FWR-148)
            string bareText = Icu.Normalize(xml.Value, Icu.UNormalizationMode.UNORM_NFD);
            return TsStringUtils.MakeString(bareText, GetWsForId(wsAttribute.Value, lgwsf));
        }
        return null; // If we don't have any runs, we don't have a string!
    }

    var strBldr = TsStringUtils.MakeIncStrBldr();
    // Elements() never yields null, so each runElement can be used without a null check.
    foreach (XElement runElement in runs)
    {
        string runText = runElement.Value;

        // A run without a writing system is tolerated only when its text starts with a
        // character whose code is 13 or less.
        if (runElement.Attribute("ws") == null && (runText.Length == 0 || runText[0] > 13))
        {
            throw new XmlSchemaException("Run element must contain a ws attribute. Run text: " + runElement.Value);
        }

        // Make sure the text is in the decomposed form (FWR-148)
        runText = Icu.Normalize(runText, Icu.UNormalizationMode.UNORM_NFD);
        bool isOrcNeeded = TsPropsSerializer.GetPropAttributesForElement(runElement, lgwsf, strBldr);

        // Add an ORC character, if needed, for the run
        if (runText.Length == 0 && isOrcNeeded)
        {
            runText = StringUtils.kszObject;
        }

        // Add the text with the properties to the builder
        strBldr.Append(runText);
    }

    return strBldr.GetString();
}
/// <summary>
/// Normalizing the sample word to NFD must split the accented letter into "e" followed
/// by U+0301, and .NET must report the result as being in Form D. Every character of the
/// result is also written to the console together with its position.
/// </summary>
public void Normalize_NFC2NFD()
{
    string decomposed = Icu.Normalize("tést", Icu.UNormalizationMode.UNORM_NFD);

    // Diagnostic dump of the result, one UTF-16 code unit per line.
    for (var pos = 0; pos < decomposed.Length; pos++)
    {
        Console.WriteLine("pos {0}: {1} ({1:x})", pos, decomposed[pos]);
    }

    Assert.AreEqual(0x0301, decomposed[2]);
    Assert.AreEqual("te\u0301st", decomposed);
    Assert.IsTrue(decomposed.IsNormalized(NormalizationForm.FormD));
}
/// <summary>
/// Writes one Unicode alternative as an <c>AUni</c> element whose <c>ws</c> attribute is
/// the Id of the writing system and whose content is the alternative's text in NFC.
/// Alternatives with no text are skipped.
/// </summary>
/// <param name="writer">Writer that receives the element.</param>
/// <param name="wsf">Writing system factory; not used by this override.</param>
/// <param name="ws">Writing system handle, looked up in the object's WritingSystemManager.</param>
/// <param name="alternative">The string whose text is written.</param>
protected override void ToXml(XmlWriter writer, ILgWritingSystemFactory wsf, int ws, ITsString alternative)
{
    string content = alternative.Text;
    if (string.IsNullOrEmpty(content))
    {
        // Skip writing TsStrings with no content.
        return;
    }

    string wsId = m_object.Services.WritingSystemManager.Get(ws).Id;

    writer.WriteStartElement("AUni");
    writer.WriteAttributeString("ws", wsId);
    writer.WriteString(Icu.Normalize(content, Icu.UNormalizationMode.UNORM_NFC));
    writer.WriteEndElement();
}
/// <summary>
/// Parses one wordform and returns the parser output in NFD. Adds the time spent in the
/// parser to <c>m_ticksParser</c> and increments <c>m_numberOfWordForms</c>.
/// </summary>
/// <param name="hvoWordform">ID of the wordform; asserted to be greater than zero.</param>
/// <param name="form">Text of the wordform; asserted to be non-null. It is converted to
/// NFD before being passed to the parser.</param>
/// <returns>The result of ParseWord, normalized to NFD.</returns>
private string GetOneWordformResult(int hvoWordform, string form)
{
    Debug.Assert(hvoWordform > 0, "Wordform ID must be greater than zero.");
    Debug.Assert(form != null, "Wordform form must not be null.");

    Trace.WriteLineIf(m_tracingSwitch.TraceInfo,
        "GetOneWordformResult(): CurrentThreadId = " + Win32.GetCurrentThreadId());

    // A Stopwatch is immune to system clock and daylight-saving adjustments, which could
    // skew (or even make negative) a difference of two DateTime.Now readings.
    // Elapsed.Ticks uses the same 100 ns unit as DateTime.Ticks.
    var stopwatch = Stopwatch.StartNew();
    var results = ParseWord(Icu.Normalize(form, Icu.UNormalizationMode.UNORM_NFD), hvoWordform);
    stopwatch.Stop();

    long ttlTicks = stopwatch.Elapsed.Ticks;
    m_ticksParser += ttlTicks;
    m_numberOfWordForms++;
    Trace.WriteLineIf(m_tracingSwitch.TraceInfo, "ParseWord(" + form + ") took : " + ttlTicks);
    return Icu.Normalize(results, Icu.UNormalizationMode.UNORM_NFD);
}
/// <summary>
/// Parses (or traces the parse of) a single word form and stores the outcome, in NFD, in
/// the details of a task report.
/// </summary>
/// <param name="sForm">the word form to parse; converted to NFD before use</param>
/// <param name="fDoTrace">true to run TraceWord, false to run ParseWord</param>
/// <param name="sSelectTraceMorphs">list of msa hvos to limit trace to</param>
/// <exception cref="ArgumentNullException"><paramref name="sForm"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="sForm"/> is empty.</exception>
/// <exception cref="ApplicationException">Wraps any exception raised while parsing or
/// tracing.</exception>
internal void TryAWord(string sForm, bool fDoTrace, string sSelectTraceMorphs)
{
    CheckDisposed();

    if (sForm == null)
    {
        throw new ArgumentNullException("sForm", "TryAWord cannot trace a Null string.");
    }
    if (sForm.Length == 0)
    {
        throw new ArgumentException("Can't try a word with no content.", "sForm");
    }

    string description = String.Format(ParserCoreStrings.ksTraceWordformX, sForm);
    using (TaskReport task = new TaskReport(description, m_taskUpdateHandler))
    {
        try
        {
            string normForm = Icu.Normalize(sForm, Icu.UNormalizationMode.UNORM_NFD);
            string result = fDoTrace
                ? TraceWord(normForm, sSelectTraceMorphs)
                : ParseWord(normForm, 0);
            task.Details = Icu.Normalize(result, Icu.UNormalizationMode.UNORM_NFD);
        }
        catch (Exception error)
        {
            Trace.WriteLineIf(tracingSwitch.TraceError,
                "The word '" + sForm + "' failed to parse. error was: " + error.Message);
            // Don't want to show message box in addition to yellow crash box!
            task.EncounteredError(null);
            //might as well keep going.
            //TODO: create an problem object since we could not parse this word.
            throw new ApplicationException("Error while parsing '" + sForm + "'.", error);
        }
    }
}
/// <summary>
/// Writes an ordinary C# string as a property element that wraps a <c>Uni</c> element
/// holding the text in NFC. Nothing is written when the text is null or empty.
/// </summary>
/// <param name="writer">Writer that receives the elements.</param>
/// <param name="elementName">Name of the outer (property) element.</param>
/// <param name="propertyData">The text to store.</param>
/// <exception cref="ArgumentNullException"><paramref name="writer"/> is null, or
/// <paramref name="elementName"/> is null or empty.</exception>
internal static void WriteUnicodeString(XmlWriter writer, string elementName, string propertyData)
{
    if (writer == null)
    {
        throw new ArgumentNullException("writer");
    }
    if (string.IsNullOrEmpty(elementName))
    {
        throw new ArgumentNullException("elementName");
    }

    bool hasContent = !string.IsNullOrEmpty(propertyData);
    if (hasContent)
    {
        string nfcText = Icu.Normalize(propertyData, Icu.UNormalizationMode.UNORM_NFC);

        writer.WriteStartElement(elementName); // property element
        writer.WriteStartElement("Uni");
        writer.WriteString(nfcText); // stored as NFC
        writer.WriteEndElement(); // Uni
        writer.WriteEndElement(); // property element
    }
}
/// <summary>
/// Reads one <c>AUni</c> element into a string in the writing system named by its
/// <c>ws</c> attribute, converting the text to NFD.
/// </summary>
/// <param name="aUniNode">The AUni element to read.</param>
/// <param name="wsf">Factory used to turn the ws attribute value into a handle.</param>
/// <param name="tsf">Factory used to create the resulting string.</param>
/// <param name="tss">Receives the created string, or null when the element is rejected.</param>
/// <returns>The writing system handle of the created string, or 0 when the element has no
/// text, has no (or an empty) ws attribute, or names a writing system the factory maps to 0.</returns>
internal static int ReadMultiUnicodeAlternative(XElement aUniNode, ILgWritingSystemFactory wsf, ITsStrFactory tsf, out ITsString tss)
{
    tss = null;

    string rawText = aUniNode.Value;
    if (String.IsNullOrEmpty(rawText))
    {
        return 0;
    }

    XAttribute wsAttribute = aUniNode.Attribute("ws");
    bool hasWsId = wsAttribute != null && !String.IsNullOrEmpty(wsAttribute.Value);
    if (!hasWsId)
    {
        return 0;
    }

    // Throwing out a string without a ws is probably better than crashing
    // and preventing a db from being opened.
    // This code currently accepts this data, only storing en__IPA and fr strings.
    // <Form>
    // <AUni ws="en" />
    // <AUni ws="en__IPA">problematic</AUni>
    // <AUni>missing</AUni>
    // <AUni></AUni>
    // <AUni ws="fr">french</AUni>
    // <AUni/>
    // </Form>
    int wsHandle = wsf.GetWsFromStr(wsAttribute.Value);
    if (wsHandle == 0)
    {
        return 0;
    }

    tss = tsf.MakeString(Icu.Normalize(rawText, Icu.UNormalizationMode.UNORM_NFD), wsHandle);
    return wsHandle;
}
/// <summary>
/// Handle the CommitText event. Transfer to GUI thread if necessary.
/// </summary>
/// <remarks>
/// When the root site reports InvokeRequired, the call is re-posted through
/// SafeBeginInvoke and this invocation returns immediately. Otherwise the committed text
/// is inserted through the editing helper inside an undo task (when an action handler is
/// available), and the saved preedit selection is shifted, restored and re-filled with
/// the left-over preedit string. m_savedPreeditSelection is always cleared on exit.
/// </remarks>
/// <param name="text">The committed text. Per the comment below it arrives in NFC; any
/// leading backspaces are removed by TrimBeginningBackspaces after the text has been
/// passed to OnCharAux.</param>
private void CommitTextEventHandler(string text)
{
    if (AssociatedSimpleRootSite.InvokeRequired)
    {
        AssociatedSimpleRootSite.SafeBeginInvoke(
            new CommitDelegate(CommitTextEventHandler), new object[] { text });
        return;
    }

    IActionHandler actionHandler = AssociatedSimpleRootSite.DataAccess.GetActionHandler();
    try
    {
        if (actionHandler != null)
        {
            actionHandler.BeginUndoTask(Resources.ksUndoTyping, Resources.ksRedoTyping);
        }

        // Save existing Preedit Selection and existing left-over preedit string.
        var preeditSelection = SavePreeditSelection();
        ITsString preedit;
        preeditSelection.Selection.GetSelectionString(out preedit, String.Empty);

        // Change selection to a insertion point (unless we moved the selection before,
        // which happens when we come here as part of processing a mouse click)
        // And insert commit text.
        var selHelper = new SelectionHelper(
            m_savedPreeditSelection ?? AssociatedSimpleRootSite.EditingHelper.CurrentSelection);
        if (m_savedPreeditSelection == null)
        {
            selHelper.ReduceToIp(SelectionHelper.SelLimitType.Anchor);
        }
        selHelper.SetSelection(true);
        // The text is handed to the editing helper before the leading backspaces are trimmed.
        AssociatedSimpleRootSite.EditingHelper.OnCharAux(text, VwShiftStatus.kfssNone, Keys.None);

        int deletedChars = TrimBeginningBackspaces(ref text);

        // Update the saved preedit selection to take account of the inserted text
        // text is in NFC, but the view uses it in NFD, so we have to convert it.
        // We don't do this if we moved the selection prior to this method.
        int textLenNFD = m_savedPreeditSelection != null ? 0 : Icu.Normalize(text, Icu.UNormalizationMode.UNORM_NFD).Length;
        int anchor = preeditSelection.IchAnchor + textLenNFD - deletedChars;

        // select the text we just inserted
        // TODO: underline the text so that it is more obvious that this is just preedit text
        preeditSelection.SetIch(SelectionHelper.SelLimitType.Anchor, anchor);
        preeditSelection.SetIch(SelectionHelper.SelLimitType.End, preeditSelection.IchEnd + textLenNFD - deletedChars);

        // reshow the preedit selection
        RestorePreeditSelection(preeditSelection);
        preeditSelection.Selection.ReplaceWithTsString(preedit);
        preeditSelection.SetSelection(true);
    }
    finally
    {
        // Runs on success and on failure: forget the saved selection and close the undo task.
        m_savedPreeditSelection = null;
        if (actionHandler != null)
        {
            actionHandler.EndUndoTask();
        }
    }
}
// Implementation of both get_NormalizedForm and NfdAndFixOffsets
/// <summary>
/// Returns this string in the requested normalization form and, when offsets are
/// supplied, rewrites each of them in place to the matching position in the result.
/// </summary>
/// <param name="nm">Target normalization mode; knmLim is rejected.</param>
/// <param name="oldOffsetsToFix">Array of pointers to the int offsets to rewrite; may be
/// null or wrap a zero pointer, in which case no offsets are fixed.</param>
/// <param name="numOffsetsToFix">Number of pointers in <paramref name="oldOffsetsToFix"/>.</param>
/// <returns>This instance when it is already normalized (or has no text); otherwise a new
/// string that has been marked as normalized for <paramref name="nm"/>.</returns>
/// <exception cref="ArgumentException"><paramref name="nm"/> is knmLim (only reached when
/// the string is not already marked normalized and has text).</exception>
private ITsString get_NormalizedFormAndFixOffsets(FwNormalizationMode nm, ArrayPtr oldOffsetsToFix, int numOffsetsToFix)
{
    // Can we skip unnecessary work?
    if (IsAlreadyNormalized(nm))
    {
        return(this);
    }
    if (string.IsNullOrEmpty(Text))
    {
        NoteAlreadyNormalized(nm);
        return(this);
    }
    if (nm == FwNormalizationMode.knmLim)
    {
        throw new ArgumentException("Normalization mode may not be knmLim", "nm");
    }
    // NFSC needs to be decomposed first, then recomposed as NFC.
    if (nm == FwNormalizationMode.knmNFSC && !get_IsNormalizedForm(FwNormalizationMode.knmNFD))
    {
        // NOTE(review): the offsets are only passed to the second (NFD -> NFSC) step below;
        // confirm whether the original -> NFD step is expected to adjust them as well.
        var nfd = (TsString)get_NormalizedForm(FwNormalizationMode.knmNFD);
        // Line below is *not* a typo; this call will not recurse infinitely.
        return(nfd.get_NormalizedFormAndFixOffsets(FwNormalizationMode.knmNFSC, oldOffsetsToFix, numOffsetsToFix));
    }
    bool willFixOffsets = numOffsetsToFix > 0 && oldOffsetsToFix != null && oldOffsetsToFix.IntPtr != IntPtr.Zero;
    // Keys = offsets into original string, values = offsets into normalized string
    var stringOffsetMapping = willFixOffsets ? new Dictionary <int, int>() : null; // Don't allocate an object if we'll never use it
    // NFSC has no ICU mode of its own and is normalized with the NFC normalizer; every
    // other mode is cast directly to the ICU enumeration.
    Icu.UNormalizationMode icuMode = (nm == FwNormalizationMode.knmNFSC) ? Icu.UNormalizationMode.UNORM_NFC : (Icu.UNormalizationMode)nm;
    IntPtr icuNormalizer = Icu.GetIcuNormalizer(icuMode);
    TsStrBldr resultBuilder = new TsStrBldr();
    int segmentMin = 0;
    // The text is processed one segment at a time; EnumerateSegmentLimits supplies the end
    // offset of each segment.
    foreach (int segmentLim in EnumerateSegmentLimits(icuNormalizer))
    {
        string segment = GetChars(segmentMin, segmentLim);
        string normalizedSegment = Icu.Normalize(segment, icuNormalizer);
        int curRun = get_RunAt(segmentMin);
        int curRunLim = get_LimOfRun(curRun);
        ITsTextProps curTextProps = get_Properties(curRun);
        if (curRunLim >= segmentLim)
        {
            // The segment is contained entirely in the current run, so our job is simple
            int outputLenSoFar = resultBuilder.Length;
            resultBuilder.Replace(outputLenSoFar, outputLenSoFar, normalizedSegment, curTextProps);
            // Calculate the orig -> norm index mappings if (and only if) they're needed, since this calculation is expensive
            if (willFixOffsets)
            {
                foreach (RearrangedIndexMapping mapping in MatchUpIndexesAfterNormalization(segment, normalizedSegment, icuNormalizer))
                {
                    // Note that our local mapping is from the start of this segment, but we want to keep track of indexes from the start
                    // of the *string*. (Both the original string and the output, normalized string). So we adjust the indexes here.
                    if (mapping.isFirstCharOfDecomposition)
                    {
                        stringOffsetMapping[segmentMin + mapping.origIdx] = outputLenSoFar + mapping.normIdx;
                    }
                }
            }
        }
        else
        {
            // The segment straddles two runs, so our job is harder. We have to either deal with decomposition
            // rearranging things (and make sure the right characters maintain the right text properties), or
            // else we have to deal with composition possibly trying to "compress" some diacritics that straddle
            // a run border (which can happen, for example, if they have different text properties).
            if (nm == FwNormalizationMode.knmNFD || nm == FwNormalizationMode.knmNFKD)
            {
                // Decomposition: we have to deal with rearranging. Some characters from after the first run's
                // endpoint may have ended up "inside" the first run after rearranging, so their text properties
                // will be incorrect at first. We'll fix them up after calculating the orig -> norm index mappings.
                int outputLenSoFar = resultBuilder.Length; // Start index in the output of this segment's normalized text
                resultBuilder.Replace(outputLenSoFar, outputLenSoFar, normalizedSegment, curTextProps);
                // Now correct the text properties, one index at a time.
                IEnumerable <RearrangedIndexMapping> indexMappings = MatchUpIndexesAfterNormalization(segment, normalizedSegment, icuNormalizer);
                foreach (RearrangedIndexMapping mapping in indexMappings)
                {
                    ITsTextProps origProperties = get_PropertiesAt(segmentMin + mapping.origIdx);
                    int outputIdx = outputLenSoFar + mapping.normIdx;
                    // A surrogate pair occupies two UTF-16 code units, so both get the properties.
                    int size = Char.IsSurrogate(normalizedSegment, mapping.normIdx) ? 2 : 1;
                    resultBuilder.SetProperties(outputIdx, outputIdx + size, origProperties);
                    // And if we also need to fix up offsets at the end, we keep track of the ones we'll need
                    if (willFixOffsets && mapping.isFirstCharOfDecomposition)
                    {
                        stringOffsetMapping[segmentMin + mapping.origIdx] = outputLenSoFar + mapping.normIdx;
                    }
                }
            }
            else if (nm == FwNormalizationMode.knmNFSC)
            {
                // Composition that preserves styles. By this point, our input is NFD so we at least know there will be no rearranging.
                // If there is more than one character remaining in the current run, then we might be able to compose those, at least.
                if (curRunLim - segmentMin > 1)
                {
                    // Unicode canonical ordering is such that any subsequence of a composed character can itself be composed, so this is safe.
                    string remainderOfFirstRun = GetChars(segmentMin, curRunLim);
                    string normalizedRemainder = Icu.Normalize(remainderOfFirstRun, icuNormalizer);
                    resultBuilder.Replace(resultBuilder.Length, resultBuilder.Length, normalizedRemainder, curTextProps);
                    // Now the start of the un-composable part is just the limit of the first run (which is the start of the second run).
                    segmentMin = curRunLim;
                }
                // Now there could be any NUMBER of runs between currentInputIdx and segmentLim. Maybe there are TEN composing
                // characters, each with different text properties (and thus different runs). However, since the base character
                // was in the first run, none of the characters from the second or subsequent runs are composable any longer. So we
                // can copy them to the output as-is as one big TsString, which will carry text, runs and all.
                ITsString uncomposablePartOfSegment = GetSubstring(segmentMin, segmentLim);
                resultBuilder.ReplaceTsString(resultBuilder.Length, resultBuilder.Length, uncomposablePartOfSegment);
            }
            else
            {
                // For NFC and NFKC, we do not try to preserve styles or offset mappings, so this branch is quite simple
                int outputLenSoFar = resultBuilder.Length;
                resultBuilder.Replace(outputLenSoFar, outputLenSoFar, normalizedSegment, curTextProps);
            }
        }
        segmentMin = segmentLim; // Next segment will start where the current segment ended
    }
    if (willFixOffsets)
    {
        // Map the end of the original text (the last segment limit) to the end of the output.
        stringOffsetMapping[segmentMin] = resultBuilder.Length;
        int ptrSize = Marshal.SizeOf(typeof(IntPtr));
        // oldOffsetsToFix is read as an array of pointers; each points at one int offset
        // that is read and then overwritten in place.
        for (int i = 0; i < numOffsetsToFix; i++)
        {
            IntPtr offsetPtr = Marshal.ReadIntPtr(oldOffsetsToFix.IntPtr, i * ptrSize);
            int oldOffset = Marshal.ReadInt32(offsetPtr);
            int newOffset;
            if (stringOffsetMapping.TryGetValue(oldOffset, out newOffset))
            {
                Marshal.WriteInt32(offsetPtr, newOffset);
            }
            else
            {
                // The only likely way for one of the offsets we've been asked to fix up to NOT
                // be found in the offset mapping dictionary is if it happened to be an offset
                // to the second half of a surrogate pair. In which case we want to fix it up to
                // point to wherever the first half of that pair ended up, so searching downwards
                // through the offset mapping dictionary will find the best match.
                bool found = false;
                while (!found && oldOffset > 0)
                {
                    oldOffset--;
                    found = stringOffsetMapping.TryGetValue(oldOffset, out newOffset);
                }
                // Any offset that could not be matched at all will be pointed at the beginning
                // of the TsString, since that's safe with strings of all sizes (including empty).
                Marshal.WriteInt32(offsetPtr, found ? newOffset : 0);
            }
        }
    }
    var result = (TsString)resultBuilder.GetString();
    result.NoteAlreadyNormalized(nm); // So we won't have to do all this work a second time
    return(result);
}
/// <summary>
/// Serializes the <see cref="ITsString"/> to XML.
/// </summary>
/// <param name="tss">The string to serialize; its NFSC form is what gets written.</param>
/// <param name="lgwsf">Writing system factory used to resolve <paramref name="ws"/> and
/// passed on when int properties are written.</param>
/// <param name="ws">When greater than zero, the root element is <c>AStr</c> with a
/// <c>ws</c> attribute for this writing system; otherwise the root element is <c>Str</c>.</param>
/// <param name="writeObjData">When false, runs whose object data is a hot picture or a
/// (owned) name GUID are left out.</param>
/// <param name="indent">Selects the indentation characters of the XML writer.</param>
/// <returns>The XML text, without an XML declaration.</returns>
public static string SerializeTsStringToXml(ITsString tss, ILgWritingSystemFactory lgwsf, int ws = 0, bool writeObjData = true, bool indent = false)
{
    // We export only the NFSC form (NFC with exceptions for the parallel style information)
    ITsString normalizedTss = tss.get_NormalizedForm(FwNormalizationMode.knmNFSC);
    var xml = new StringBuilder();
    var settings = new XmlWriterSettings
    {
        OmitXmlDeclaration = true,
        Indent = true,
        IndentChars = indent ? " " : string.Empty,
        NewLineChars = Environment.NewLine
    };
    using (var writer = XmlWriter.Create(xml, settings))
    {
        if (ws > 0)
        {
            string id = lgwsf.GetStrFromWs(ws);
            writer.WriteStartElement("AStr");
            writer.WriteAttributeString("ws", Icu.Normalize(id, Icu.UNormalizationMode.UNORM_NFC));
        }
        else
        {
            writer.WriteStartElement("Str");
        }

        // Write the properties and text for each run
        string fieldName = null;
        for (int i = 0; i < normalizedTss.RunCount; i++)
        {
            TsRunInfo tri;
            ITsTextProps textProps = normalizedTss.FetchRunInfo(i, out tri);

            // The first character of the object data encodes its type. Guard against an
            // empty value before indexing it, as the string-property loop below also does.
            string objDataStr;
            if (textProps.TryGetStringValue(FwTextPropType.ktptObjData, out objDataStr) && !writeObjData
                && !string.IsNullOrEmpty(objDataStr))
            {
                var chType = (FwObjDataTypes)objDataStr[0];
                if (chType == FwObjDataTypes.kodtPictEvenHot || chType == FwObjDataTypes.kodtPictOddHot ||
                    chType == FwObjDataTypes.kodtNameGuidHot || chType == FwObjDataTypes.kodtOwnNameGuidHot)
                {
                    continue;
                }
            }

            // Runs are grouped in a Field element; a change of field name closes the open
            // Field element (if any) and opens a new one when the new name is not empty.
            string runFieldName;
            if (textProps.TryGetStringValue(FwTextPropType.ktptFieldName, out runFieldName) && fieldName != runFieldName)
            {
                if (!string.IsNullOrEmpty(fieldName))
                {
                    writer.WriteEndElement();
                }
                if (!string.IsNullOrEmpty(runFieldName))
                {
                    writer.WriteStartElement("Field");
                    writer.WriteAttributeString("name", runFieldName);
                }
                fieldName = runFieldName;
            }

            // A run whose mark-item property is forced on is wrapped in an Item element.
            bool markItem;
            FwTextPropVar var;
            int markItemValue;
            if (textProps.TryGetIntValue(FwTextPropType.ktptMarkItem, out var, out markItemValue) &&
                var == FwTextPropVar.ktpvEnum && markItemValue == (int)FwTextToggleVal.kttvForceOn)
            {
                writer.WriteStartElement("Item");
                writer.WriteStartElement("Run");
                markItem = true;
            }
            else
            {
                writer.WriteStartElement("Run");
                markItem = false;
            }

            for (int j = 0; j < textProps.IntPropCount; j++)
            {
                FwTextPropType tpt;
                int value = textProps.GetIntProperty(j, out tpt, out var);
                if (tpt != FwTextPropType.ktptMarkItem)
                {
                    TsPropsSerializer.WriteIntProperty(writer, lgwsf, tpt, var, value);
                }
            }

            byte[] pict = null;
            bool hotGuid = false;
            for (int j = 0; j < textProps.StrPropCount; j++)
            {
                FwTextPropType tpt;
                string value = textProps.GetStringProperty(j, out tpt);
                TsPropsSerializer.WriteStringProperty(writer, tpt, value);
                if (tpt == FwTextPropType.ktptObjData && !string.IsNullOrEmpty(value))
                {
                    switch ((FwObjDataTypes)value[0])
                    {
                        // The element data associated with a picture is the actual picture data
                        // since it is much too large to want embedded as an XML attribute value.
                        // (This is an antique kludge that isn't really used in practice, but some
                        // of our test data still exercises it.)
                        case FwObjDataTypes.kodtPictEvenHot:
                        case FwObjDataTypes.kodtPictOddHot:
                            pict = Encoding.Unicode.GetBytes(value.Substring(1));
                            break;
                        // The generated XML contains both the link value as an attribute and the
                        // (possibly edited) display string as the run's element data.
                        case FwObjDataTypes.kodtExternalPathName:
                            break;
                        // used ONLY in the clipboard...contains XML representation of (currently) a footnote.
                        case FwObjDataTypes.kodtEmbeddedObjectData:
                            break;
                        // The string data associated with this run is assumed to be a dummy magic
                        // character that flags (redundantly for XML) that the actual data to
                        // display is based on the ktptObjData attribute.
                        case FwObjDataTypes.kodtNameGuidHot:
                        case FwObjDataTypes.kodtOwnNameGuidHot:
                        case FwObjDataTypes.kodtContextString:
                        case FwObjDataTypes.kodtGuidMoveableObjDisp:
                            hotGuid = true;
                            break;
                    }
                }
            }

            if (pict != null)
            {
                // Write the bytes of the picture data as hex, with a line break after every 32 bytes
                var sb = new StringBuilder();
                for (int j = 0; j < pict.Length; j++)
                {
                    sb.Append(pict[j].ToString("X2"));
                    if (j % 32 == 31)
                    {
                        sb.AppendLine();
                    }
                }
                writer.WriteString(sb.ToString());
            }
            else if (hotGuid)
            {
                writer.WriteString(string.Empty);
            }
            else
            {
                string runText = normalizedTss.get_RunText(i) ?? string.Empty;
                // Whitespace-only text is flagged with xml:space="preserve".
                if (runText != string.Empty && runText.All(char.IsWhiteSpace))
                {
                    writer.WriteAttributeString("xml", "space", "", "preserve");
                }
                // TODO: should we escape quotation marks? this is not necessary but different than the behavior of the C++ implementation
                writer.WriteString(Icu.Normalize(runText, Icu.UNormalizationMode.UNORM_NFC));
            }

            writer.WriteEndElement();
            if (markItem)
            {
                writer.WriteEndElement();
            }
        }

        if (!string.IsNullOrEmpty(fieldName))
        {
            writer.WriteEndElement();
        }
        writer.WriteEndElement();
    }
    return xml.ToString();
}
/// <summary>
/// Get the set of significant digraphs (multigraphs) for the writing system. At the
/// moment, these are derived from ICU sorting rules associated with the writing system.
/// Results are cached per writing system identifier.
/// </summary>
/// <param name="sWs">Writing system identifier.</param>
/// <param name="mapChars">Receives a map from an equivalent graph (NFD, lowercased) to
/// the lowercased primary graph it is declared equal to in the rules.</param>
/// <returns>The set of multi-character graphs (NFD, lowercased) found in the rules; empty
/// when no usable rules are available.</returns>
private Set<string> GetDigraphs(string sWs, out Dictionary<string, string> mapChars)
{
    Set<string> digraphs = null;
    if (m_mapWsDigraphs.TryGetValue(sWs, out digraphs))
    {
        mapChars = m_mapWsMapChars[sWs];
        return digraphs;
    }

    digraphs = new Set<string>();
    mapChars = new Dictionary<string, string>();
    int ws = m_cache.LanguageWritingSystemFactoryAccessor.GetWsFromStr(sWs);
    string sIcuRules = null;
    if (ws > 0)
    {
        // get_EngineOrNull may return null, so check before dereferencing; a missing
        // engine is treated like one without collations.
        IWritingSystem wsX = m_cache.LanguageWritingSystemFactoryAccessor.get_EngineOrNull(ws);
        if (wsX != null && wsX.CollationCount > 0)
        {
            ICollation coll = wsX.get_Collation(0);
            sIcuRules = coll.IcuRules;
            if (String.IsNullOrEmpty(sIcuRules))
            {
                // The ICU rules may not be loaded for built-in languages, but are
                // still helpful for our purposes here.
                string sIcuOrig = sIcuRules;
                coll.LoadIcuRules(sWs);
                sIcuRules = coll.IcuRules;
                coll.IcuRules = sIcuOrig; // but we don't want to actually change anything!
            }
        }
    }

    if (!String.IsNullOrEmpty(sIcuRules) && sIcuRules.Contains("&"))
    {
        string[] rgsRules = sIcuRules.Split(new char[] { '&' }, StringSplitOptions.RemoveEmptyEntries);
        for (int i = 0; i < rgsRules.Length; ++i)
        {
            string sRule = rgsRules[i];
            // This is a valid rule that specifies that the digraph aa should be ignored
            // [last tertiary ignorable] = \u02bc = aa
            // but the code here will ignore this. YAGNI the chances of a user specifying a digraph
            // as ignorable may never happen.
            if (sRule.Contains("["))
            {
                sRule = sRule.Substring(0, sRule.IndexOf("["));
            }
            if (String.IsNullOrEmpty(sRule.Trim()))
            {
                continue;
            }

            // Secondary and tertiary differences are treated as equality.
            sRule = sRule.Replace("<<<", "=");
            sRule = sRule.Replace("<<", "=");
            if (sRule.Contains("<"))
            {
                // "&N<ng<<<Ng<ny<<<Ny" => "&N<ng=Ng<ny=Ny"
                // "&N<ñ<<<Ñ" => "&N<ñ=Ñ"
                // There are other issues we are not handling properly such as the next line
                // &N<\u006e\u0067
                string[] rgsPieces = sRule.Split(new char[] { '<', '=' }, StringSplitOptions.RemoveEmptyEntries);
                for (int j = 0; j < rgsPieces.Length; ++j)
                {
                    string sGraph = rgsPieces[j];
                    sGraph = sGraph.Trim();
                    if (String.IsNullOrEmpty(sGraph))
                    {
                        continue;
                    }
                    sGraph = Icu.Normalize(sGraph, Icu.UNormalizationMode.UNORM_NFD);
                    // Only graphs longer than one UTF-16 code unit (after NFD) are recorded.
                    if (sGraph.Length > 1)
                    {
                        sGraph = Icu.ToLower(sGraph, sWs);
                        if (!digraphs.Contains(sGraph))
                        {
                            digraphs.Add(sGraph);
                        }
                    }
                }
            }
            else if (sRule.Contains("="))
            {
                // "&ae<<æ<<<Æ" => "&ae=æ=Æ"
                string[] rgsPieces = sRule.Split(new char[] { '=' }, StringSplitOptions.RemoveEmptyEntries);
                if (rgsPieces.Length == 0)
                {
                    // A rule consisting only of '=' signs has no graphs at all.
                    continue;
                }
                string sGraphPrimary = rgsPieces[0].Trim();
                Debug.Assert(!String.IsNullOrEmpty(sGraphPrimary));
                sGraphPrimary = Icu.ToLower(sGraphPrimary, sWs);
                for (int j = 1; j < rgsPieces.Length; ++j)
                {
                    string sGraph = rgsPieces[j];
                    sGraph = sGraph.Trim();
                    if (String.IsNullOrEmpty(sGraph))
                    {
                        continue;
                    }
                    sGraph = Icu.Normalize(sGraph, Icu.UNormalizationMode.UNORM_NFD);
                    sGraph = Icu.ToLower(sGraph, sWs);
                    if (sGraph != sGraphPrimary)
                    {
                        if (!mapChars.ContainsKey(sGraph))
                        {
                            mapChars.Add(sGraph, sGraphPrimary);
                        }
                    }
                }
            }
        }
    }

    m_mapWsDigraphs.Add(sWs, digraphs);
    m_mapWsMapChars.Add(sWs, mapChars);
    return digraphs;
}
/// <summary>
/// Converts a word to the bytes handed to the native (C++) code. The arguments (char * in
/// C++) cannot be declared as [MarshalAs(UnmanagedType.LPStr)] string, because that
/// unconditionally converts the string using the current system code page, which is never
/// what we want; so they are declared as byte[] and marshalled here instead. The C++ code
/// requires null termination, so a null is appended before converting. (This doesn't seem
/// to be necessary, but better safe than sorry.)
/// </summary>
/// <param name="word">The word to marshal; it is converted to NFC first.</param>
/// <returns>The UTF-8 bytes of the NFC form of the word followed by a null character.</returns>
#if __MonoCS__
private static byte[] MarshallAsUtf8Bytes(string word)
{
    string terminated = Icu.Normalize(word, Icu.UNormalizationMode.UNORM_NFC) + "\0";
    return Encoding.UTF8.GetBytes(terminated);
}
/// <summary>
/// Returns the NFC form of <paramref name="word"/> with a null character appended, after a
/// round trip through its UTF-8 encoding.
/// </summary>
/// <param name="word">The word to marshal.</param>
/// <returns>The string decoded from the UTF-8 bytes of the NFC word plus the terminator.</returns>
private static string MarshallAsUtf8Bytes(string word)
{
    string terminated = Icu.Normalize(word, Icu.UNormalizationMode.UNORM_NFC) + "\0";
    byte[] utf8 = Encoding.UTF8.GetBytes(terminated);
    return Encoding.UTF8.GetString(utf8);
}