Example #1
0
        /// ------------------------------------------------------------------------------------
        /// <summary>
        /// Writes the Unicode string property <paramref name="tag"/> of the current object,
        /// normalized to NFC, between the field's start and end tags. The text is emitted as a
        /// Uni element when no writing system id is available, otherwise as an AUni element
        /// carrying that id in its ws attribute.
        /// </summary>
        /// <param name="tag">Identifies the property to read and the field tags to write.</param>
        /// <param name="ws">Writing system value, resolved to an id via WritingSystemId.</param>
        /// <param name="_vwvc">View constructor; not used by this override.</param>
        /// ------------------------------------------------------------------------------------
        public override void AddUnicodeProp(int tag, int ws, IVwViewConstructor _vwvc)
        {
            CurrentContext savedContext = WriteFieldStartTag(tag);
            string         value        = DataAccess.get_UnicodeProp(CurrentObject(), tag);

            // Exported text must be NFC; only convert when it is not already in that form.
            Icu.InitIcuDataDir();
            bool alreadyNfc = Icu.IsNormalized(value, Icu.UNormalizationMode.UNORM_NFC);
            if (!alreadyNfc)
            {
                value = Icu.Normalize(value, Icu.UNormalizationMode.UNORM_NFC);
            }

            string wsId    = WritingSystemId(ws);
            bool   hasWsId = !String.IsNullOrEmpty(wsId);

            IndentLine();
            if (hasWsId)
            {
                m_writer.WriteLine("<AUni ws=\"{0}\">{1}</AUni>", wsId, XmlUtils.MakeSafeXml(value));
            }
            else
            {
                m_writer.WriteLine("<Uni>{0}</Uni>", XmlUtils.MakeSafeXml(value));
            }
            WriteFieldEndTag(tag, savedContext);
        }
Example #2
0
        public void Normalize_NFC2NFC()
        {
            // Text passed through NFC normalization must come back unchanged and still be NFC.
            const string input = "tést";

            string actual = Icu.Normalize(input, Icu.UNormalizationMode.UNORM_NFC);

            Assert.AreEqual("tést", actual);
            Assert.IsTrue(actual.IsNormalized(NormalizationForm.FormC));
        }
Example #3
0
        /// <summary>
        /// Creates the affix (.aff) file next to <paramref name="dicPath"/> when it does not exist,
        /// then (re)writes the dictionary file with a size hint, the prototype word, and every
        /// word of <paramref name="words"/> normalized to NFC.
        /// </summary>
        /// <param name="dicPath">Path of the dictionary file; the affix file path is derived from it.</param>
        /// <param name="words">Words to store. The sequence is enumerated exactly once.</param>
        private static void InitDictionary(string dicPath, IEnumerable <string> words)
        {
            var affixFile = Path.ChangeExtension(dicPath, ".aff");

            if (!File.Exists(affixFile))
            {
                using (var writer = FileUtils.OpenFileForWrite(affixFile, Encoding.UTF8))
                {
                    writer.WriteLine("SET UTF-8");
                    // Enhance JohnT: may be helpful to write TRY followed by the word-forming and possibly punctuation
                    // characters of the language. This somehow affects the suggestions, but I haven't figured out how yet.
                    writer.WriteLine("KEEPCASE " + keepCaseFlag);
                }
            }

            // Materialize the sequence once, before the dictionary file is opened: it is needed both
            // for the size hint and for the entries, and a lazily evaluated source could otherwise
            // be evaluated twice and yield a count that does not match the words written.
            var wordList = words.ToList();

            // If it already exists, probably we disabled it by deleting the .aff file--an approach we
            // no longer use; re-creating it should reinstate it.
            using (var writer = FileUtils.OpenFileForWrite(dicPath, Encoding.UTF8))
            {
                // This is a size of hash table to allocate, NOT the exact number of words in the dictionary.
                // In particular it must NOT be zero or Hunspell will malfunction (divide by zero).
                // However, making it equal the number of words helps Hunspell allocate a good size of hashtable.
                writer.WriteLine(Math.Max(10, wordList.Count).ToString());
                writer.WriteLine(PrototypeWord + "/" + keepCaseFlag);
                foreach (var word in wordList)
                {
                    writer.WriteLine(Icu.Normalize(word, Icu.UNormalizationMode.UNORM_NFC));
                }
            }
        }
Example #4
0
        /// <summary>
        /// Re-parses a single wordform and hands the result to the parse filer when it appears
        /// to differ from what is already recorded for the wordform.
        /// </summary>
        /// <param name="wordform">The wordform to parse.</param>
        /// <param name="priority">Priority passed on to the parse filer.</param>
        /// <returns>
        /// <c>false</c> when the wordform is not a valid object, has no text in the default
        /// vernacular writing system, or when the parse result's hash code equals the wordform's
        /// checksum; otherwise whatever the parse filer returns.
        /// </returns>
        public bool UpdateWordform(IWfiWordform wordform, ParserPriority priority)
        {
            CheckDisposed();

            int       wordformHash = 0;
            ITsString form         = null;

            using (new WorkerThreadReadHelper(m_cache.ServiceLocator.GetInstance <IWorkerThreadReadHandler>()))
            {
                if (wordform.IsValidObject)
                {
                    wordformHash = wordform.Checksum;
                    form         = wordform.Form.VernacularDefaultWritingSystem;
                }
            }
            // 'form' is still null (and wordformHash still 0) if the wordform was not a valid object.
            if (form == null || string.IsNullOrEmpty(form.Text))
            {
                return(false);
            }

            CheckNeedsUpdate();
            // The parser is given NFD text with spaces replaced by periods.
            ParseResult result = m_parser.ParseWord(Icu.Normalize(form.Text.Replace(' ', '.'), Icu.UNormalizationMode.UNORM_NFD));

            // An unchanged hash means there is nothing new to file.
            if (wordformHash == result.GetHashCode())
            {
                return(false);
            }

            return(m_parseFiler.ProcessParse(wordform, priority, result));
        }
Example #5
0
        /// <summary>
        /// Parses one wordform (converted to NFD first) and returns the parser output, also
        /// normalized to NFD. The time spent in the parse is added to <c>m_ticksParser</c>.
        /// </summary>
        /// <param name="hvoWordform">Id of the wordform; must be greater than zero.</param>
        /// <param name="form">Text of the wordform; must not be null.</param>
        /// <returns>The parse result in NFD.</returns>
        /// <exception cref="ApplicationException">
        /// Wraps any exception raised while parsing; the original is the inner exception.
        /// </exception>
        private string GetOneWordformResult(int hvoWordform, string form)
        {
            Debug.Assert(hvoWordform > 0, "Wordform ID must be greater than zero.");
            Debug.Assert(form != null, "Wordform form must not be null.");

            try
            {
                Trace.WriteLineIf(tracingSwitch.TraceInfo, "GetOneWordformResult(): CurrentThreadId = " + Win32.GetCurrentThreadId().ToString());
                // Use a monotonic timer: wall-clock differences can be wrong (even negative) when
                // the system clock is adjusted or a daylight-saving transition occurs mid-parse.
                Stopwatch parseTimer = Stopwatch.StartNew();
                string results = ParseWord(Icu.Normalize(form, Icu.UNormalizationMode.UNORM_NFD), hvoWordform);
                parseTimer.Stop();
                // Elapsed.Ticks is in 100 ns units, the same unit as DateTime.Ticks.
                long ttlTicks = parseTimer.Elapsed.Ticks;
                m_ticksParser += ttlTicks;
                DebugMsg("ParseWord(" + form + ") took : " + ttlTicks.ToString());
                return(Icu.Normalize(results, Icu.UNormalizationMode.UNORM_NFD));
            }
            catch (Exception error)
            {
                Trace.WriteLineIf(tracingSwitch.TraceError, "The word '"
                                  + form
                                  + "', id='"
                                  + hvoWordform.ToString()
                                  + "' failed to parse. error was: "
                                  + error.Message);
                // The failure is logged above and then surfaced to the caller.
                //TODO: create an problem object since we could not parse this word.
                throw new ApplicationException("Error while parsing '" + form + "'.", error);
            }
        }
Example #6
0
        /// ------------------------------------------------------------------------------------
        /// <summary>
        /// Fast path for the simplest string shape: exactly one child element, named Run, whose
        /// only attribute is ws. The run text is converted to NFD before the string is built.
        /// </summary>
        /// <param name="rootXml">The element including the Str or AStr tag</param>
        /// <param name="lgwsf">The writing system factory.</param>
        /// <returns>The created TsString, or null if the XML is too complex to be handled
        /// by this method.</returns>
        /// ------------------------------------------------------------------------------------
        private static ITsString HandleSimpleString(XElement rootXml, ILgWritingSystemFactory lgwsf)
        {
            var children = rootXml.Elements().ToList();

            if (children.Count != 1)
            {
                return(null);
            }

            XElement run = children[0];

            if (run.Name.LocalName != "Run")
            {
                return(null);                // probably an error; in any case not the simple shape optimized here.
            }

            var runAttributes = run.Attributes().ToList();

            // Only a lone ws attribute qualifies; anything else is left to the general handler.
            if (runAttributes.Count != 1 || runAttributes[0].Name.LocalName != "ws")
            {
                return(null);
            }

            // Make sure the text is in the decomposed form (FWR-148)
            string decomposed = Icu.Normalize(run.Value, Icu.UNormalizationMode.UNORM_NFD);

            return(TsStringUtils.MakeString(decomposed, GetWsForId(runAttributes[0].Value, lgwsf)));
        }
Example #7
0
        /// <summary>
        /// Starts a new letter section in the output when the lead character of
        /// <paramref name="sEntry"/> (title-cased) differs from the current one: closes the
        /// previous letData div if one is open, writes the letHead block, opens a new letData div
        /// and records the new current letter.
        /// </summary>
        /// <param name="sEntry">Entry text whose lead character is examined (after NFD normalization).</param>
        /// <param name="sWs">Writing system id passed to GetLeadChar and Icu.ToTitle.</param>
        private void WriteLetterHeadIfNeeded(string sEntry, string sWs)
        {
            string leadChar  = GetLeadChar(Icu.Normalize(sEntry, Icu.UNormalizationMode.UNORM_NFD), sWs);
            string titleChar = Icu.ToTitle(leadChar, sWs);

            if (titleChar == m_schCurrent)
            {
                return;                     // still within the same letter section
            }

            if (m_schCurrent.Length > 0)
            {
                m_writer.WriteLine("</div>");                       // for letData
            }
            m_writer.WriteLine("<div class=\"letHead\">");

            // Heading is "<Title> <lower>" when the two forms differ, otherwise just the one form.
            string heading = String.Empty;
            if (!String.IsNullOrEmpty(titleChar) && titleChar != leadChar)
            {
                heading = titleChar.Normalize() + " ";
            }
            if (!String.IsNullOrEmpty(leadChar))
            {
                heading += leadChar.Normalize();
            }

            m_writer.WriteLine("<div class=\"letter\">{0}</div>", XmlUtils.MakeSafeXml(heading));
            m_writer.WriteLine("</div>");
            m_writer.WriteLine("<div class=\"letData\">");
            m_schCurrent = titleChar;
        }
Example #8
0
        public void Normalize_NFD2NFD()
        {
            // Text passed through NFD normalization must come back unchanged and still be NFD.
            const string input = "te\u0301st";

            string actual = Icu.Normalize(input, Icu.UNormalizationMode.UNORM_NFD);

            Assert.AreEqual("te\u0301st", actual);
            Assert.IsTrue(actual.IsNormalized(NormalizationForm.FormD));
        }
Example #9
0
        /// <summary>
        /// Parses (or traces the parse of) a single word form and stores the outcome in the
        /// details of a task report. The form is converted to NFD before it is handed to the
        /// parser; a plain parse result is normalized to NFD again, a trace result is stored as is.
        /// </summary>
        /// <param name="sForm">the word form to parse</param>
        /// <param name="fDoTrace">whether or not to trace the parse</param>
        /// <param name="sSelectTraceMorphs">list of msa hvos to limit trace to </param>
        /// <exception cref="ArgumentNullException"><paramref name="sForm"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="sForm"/> is empty.</exception>
        public void TryAWord(string sForm, bool fDoTrace, string sSelectTraceMorphs)
        {
            CheckDisposed();

            if (sForm == null)
            {
                throw new ArgumentNullException("sForm", "TryAWord cannot trace a Null string.");
            }
            if (sForm == String.Empty)
            {
                throw new ArgumentException("Can't try a word with no content.", "sForm");
            }

            CheckNeedsUpdate();
            using (var report = new TaskReport(string.Format(ParserCoreStrings.ksTraceWordformX, sForm), m_taskUpdateHandler))
            {
                var decomposedForm = Icu.Normalize(sForm, Icu.UNormalizationMode.UNORM_NFD);

                if (fDoTrace)
                {
                    report.Details = TraceWord(decomposedForm, sSelectTraceMorphs);
                }
                else
                {
                    var parseOutput = ParseWord(decomposedForm, 0);
                    report.Details = Icu.Normalize(parseOutput, Icu.UNormalizationMode.UNORM_NFD);
                }
            }
        }
Example #10
0
        /// <summary>
        /// Records whether a word counts as correctly spelled. Nothing happens when the current
        /// check result already matches. Otherwise the internal status is updated and the file at
        /// <c>ExceptionPath</c> is rewritten with the word inserted (or replaced) at its ordinal
        /// sort position; incorrect words are stored with a leading '*'.
        /// </summary>
        /// <param name="word1">The word; it is normalized to NFC before use.</param>
        /// <param name="isCorrect">True to mark the word correct, false to mark it incorrect.</param>
        public void SetStatus(string word1, bool isCorrect)
        {
            var word = Icu.Normalize(word1, Icu.UNormalizationMode.UNORM_NFC);

            if (Check(word) == isCorrect)
            {
                return;                 // nothing to do.
            }
            // Review: any IO exceptions we should handle? How??
            SetInternalStatus(word, isCorrect);
            var  builder             = new StringBuilder();
            bool insertedLineForWord = false;

            if (File.Exists(ExceptionPath))
            {
                using (var reader = new StreamReader(ExceptionPath, Encoding.UTF8))
                {
                    string line;
                    while ((line = reader.ReadLine()) != null)
                    {
                        // Strip the leading '*' (the "incorrect" marker) to get the bare word for comparison.
                        var item = line;
                        if (item.Length > 0 && item[0] == '*')
                        {
                            item = item.Substring(1);
                        }
                        // If we already got it, or the current line is before the word, just copy the line to the output.
                        if (insertedLineForWord || String.Compare(item, word, System.StringComparison.Ordinal) < 0)
                        {
                            builder.AppendLine(line);
                            continue;
                        }
                        // We've come to the right place to insert our word.
                        if (!isCorrect)
                        {
                            builder.Append("*");
                        }
                        builder.AppendLine(word);
                        insertedLineForWord = true;
                        if (word != item)                         // then current line must be a pre-existing word that comes after ours.
                        {
                            builder.AppendLine(line);             // so add it in after item
                        }
                        // When word == item the old line is dropped: the new entry replaces it.
                    }
                }
            }
            if (!insertedLineForWord)             // no input file, or the word comes after any existing one
            {
                if (!isCorrect)
                {
                    builder.Append("*");
                }
                builder.AppendLine(word);
            }
            // Write the new file over the old one.
            File.WriteAllText(ExceptionPath, builder.ToString(), Encoding.UTF8);
        }
Example #11
0
        /// <summary>
        /// Loads a plain C# string from the Uni child of the given element and returns it in NFD.
        /// XML entities have already been interpreted as characters by the XML reader.
        /// </summary>
        /// <param name="reader">Element that contains the Uni child element.</param>
        /// <returns>The text of the Uni element, normalized to NFD.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
        internal static string LoadUnicodeString(XElement reader)
        {
            if (reader == null)
            {
                throw new ArgumentNullException("reader");
            }

            XElement uniElement = reader.Element("Uni");
            string   rawText    = uniElement.Value;

            return(Icu.Normalize(rawText, Icu.UNormalizationMode.UNORM_NFD));
        }
Example #12
0
        /// ------------------------------------------------------------------------------------
        /// <summary>
        /// Handles a complex string that contains multiple runs with optional multiple
        /// text props applied. Each run's text is converted to NFD before it is appended.
        /// </summary>
        /// <param name="xml">The XML.</param>
        /// <param name="lgwsf">The writing system factory.</param>
        /// <returns>The created TsString, or null when there are no Run elements and the
        /// element is not an AStr whose single attribute is ws.</returns>
        /// <exception cref="XmlSchemaException">A run has no ws attribute and its text is
        /// empty or starts with a character above code 13.</exception>
        /// ------------------------------------------------------------------------------------
        private static ITsString HandleComplexString(XElement xml, ILgWritingSystemFactory lgwsf)
        {
            var runs = xml.Elements("Run");

            // Any() stops at the first element instead of counting every run.
            if (!runs.Any())
            {
                if (xml.Name.LocalName == "AStr" && xml.Attributes().Count() == 1)
                {
                    // Run-less AStr carrying only a ws attribute: treat the element's own text as
                    // the string. This case hardly ever happens...maybe not at all in real life.
                    XAttribute wsAttribute = xml.Attributes().First();
                    if (wsAttribute.Name.LocalName != "ws")
                    {
                        return(null);                        // we handle only single runs with only the ws attribute.
                    }
                    // Make sure the text is in the decomposed form (FWR-148)
                    string plainText = Icu.Normalize(xml.Value, Icu.UNormalizationMode.UNORM_NFD);
                    return(TsStringUtils.MakeString(plainText, GetWsForId(wsAttribute.Value, lgwsf)));
                }
                return(null);                   // If we don't have any runs, we don't have a string!
            }

            var strBldr = TsStringUtils.MakeIncStrBldr();

            // Elements() never yields null, so no per-element null check is needed here.
            foreach (XElement runElement in runs)
            {
                string runText = runElement.Value;
                // A run may omit ws only when its text is non-empty and begins with a character
                // whose code is 13 or lower.
                if (runElement.Attribute("ws") == null && (runText.Length == 0 || runText[0] > 13))
                {
                    throw new XmlSchemaException("Run element must contain a ws attribute. Run text: " + runElement.Value);
                }

                // Make sure the text is in the decomposed form (FWR-148)
                runText = Icu.Normalize(runText, Icu.UNormalizationMode.UNORM_NFD);
                bool isOrcNeeded = TsPropsSerializer.GetPropAttributesForElement(runElement, lgwsf, strBldr);

                // Add an ORC character, if needed, for the run
                if (runText.Length == 0 && isOrcNeeded)
                {
                    runText = StringUtils.kszObject;
                }

                // Add the text with the properties to the builder
                strBldr.Append(runText);
            }

            return(strBldr.GetString());
        }
Example #13
0
        public void Normalize_NFC2NFD()
        {
            // Composed input must come back decomposed: base letter followed by U+0301.
            string decomposed = Icu.Normalize("tést", Icu.UNormalizationMode.UNORM_NFD);

            // Dump every UTF-16 code unit so a failing run shows what was actually produced.
            for (int pos = 0; pos < decomposed.Length; pos++)
            {
                Console.WriteLine("pos {0}: {1} ({1:x})", pos, decomposed[pos]);
            }

            Assert.AreEqual(0x0301, decomposed[2]);
            Assert.AreEqual("te\u0301st", decomposed);
            Assert.IsTrue(decomposed.IsNormalized(NormalizationForm.FormD));
        }
Example #14
0
        /// <summary>
        /// Writes one alternative as an AUni element whose ws attribute holds the writing system
        /// id and whose content is the alternative's text in NFC. Alternatives without text are
        /// skipped entirely.
        /// </summary>
        /// <param name="writer">Destination XML writer.</param>
        /// <param name="wsf">Writing system factory; not used by this override.</param>
        /// <param name="ws">Writing system of the alternative, looked up in the writing system manager.</param>
        /// <param name="alternative">The string whose text is written.</param>
        protected override void ToXml(XmlWriter writer, ILgWritingSystemFactory wsf, int ws, ITsString alternative)
        {
            var content = alternative.Text;

            if (string.IsNullOrEmpty(content))
            {
                return;                 // Skip writing TsStrings with no content.
            }

            var wsId = m_object.Services.WritingSystemManager.Get(ws).Id;

            writer.WriteStartElement("AUni");
            writer.WriteAttributeString("ws", wsId);
            writer.WriteString(Icu.Normalize(content, Icu.UNormalizationMode.UNORM_NFC));
            writer.WriteEndElement();
        }
Example #15
0
        /// <summary>
        /// Parses one wordform (converted to NFD first) and returns the parser output, also
        /// normalized to NFD. Updates the accumulated parse time and the wordform counter.
        /// </summary>
        /// <param name="hvoWordform">Id of the wordform; must be greater than zero.</param>
        /// <param name="form">Text of the wordform; must not be null.</param>
        /// <returns>The parse result in NFD.</returns>
        private string GetOneWordformResult(int hvoWordform, string form)
        {
            Debug.Assert(hvoWordform > 0, "Wordform ID must be greater than zero.");
            Debug.Assert(form != null, "Wordform form must not be null.");

            Trace.WriteLineIf(m_tracingSwitch.TraceInfo, "GetOneWordformResult(): CurrentThreadId = " + Win32.GetCurrentThreadId());
            // Use a monotonic timer: wall-clock differences can be wrong (even negative) when the
            // system clock is adjusted or a daylight-saving transition occurs mid-parse.
            var parseTimer = Stopwatch.StartNew();
            var results    = ParseWord(Icu.Normalize(form, Icu.UNormalizationMode.UNORM_NFD), hvoWordform);
            parseTimer.Stop();
            // Elapsed.Ticks is in 100 ns units, the same unit as DateTime.Ticks.
            long ttlTicks = parseTimer.Elapsed.Ticks;

            m_ticksParser += ttlTicks;
            m_numberOfWordForms++;
            Trace.WriteLineIf(m_tracingSwitch.TraceInfo, "ParseWord(" + form + ") took : " + ttlTicks);
            return(Icu.Normalize(results, Icu.UNormalizationMode.UNORM_NFD));
        }
Example #16
0
        /// <summary>
        /// Parses (or traces the parse of) a single word form and stores the NFD-normalized
        /// outcome in the details of a task report. The form itself is converted to NFD before
        /// it is handed to the parser.
        /// </summary>
        /// <param name="sForm">the word form to parse</param>
        /// <param name="fDoTrace">whether or not to trace the parse</param>
        /// <param name="sSelectTraceMorphs">list of msa hvos to limit trace to </param>
        /// <exception cref="ArgumentNullException"><paramref name="sForm"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="sForm"/> is empty.</exception>
        /// <exception cref="ApplicationException">
        /// Wraps any exception raised while parsing or tracing; the original is the inner exception.
        /// </exception>
        internal void TryAWord(string sForm, bool fDoTrace, string sSelectTraceMorphs)
        {
            CheckDisposed();

            if (sForm == null)
            {
                throw new ArgumentNullException("sForm", "TryAWord cannot trace a Null string.");
            }
            if (sForm == String.Empty)
            {
                throw new ArgumentException("Can't try a word with no content.", "sForm");
            }

            string reportTitle = String.Format(ParserCoreStrings.ksTraceWordformX, sForm);

            using (TaskReport report = new TaskReport(reportTitle, m_taskUpdateHandler))
            {
                try
                {
                    string decomposedForm = Icu.Normalize(sForm, Icu.UNormalizationMode.UNORM_NFD);
                    string output         = fDoTrace
                                            ? TraceWord(decomposedForm, sSelectTraceMorphs)
                                            : ParseWord(decomposedForm, 0);

                    report.Details = Icu.Normalize(output, Icu.UNormalizationMode.UNORM_NFD);
                }
                catch (Exception failure)
                {
                    Trace.WriteLineIf(tracingSwitch.TraceError, "The word '"
                                      + sForm
                                      + "' failed to parse. error was: "
                                      + failure.Message);
                    report.EncounteredError(null);                      // Don't want to show message box in addition to yellow crash box!
                    //TODO: create an problem object since we could not parse this word.
                    throw new ApplicationException("Error while parsing '" + sForm + "'.", failure);
                }
            }
        }
Example #17
0
        /// <summary>
        /// Writes an ordinary C# string as a property element wrapping a Uni element. The text
        /// is stored in NFC; the XML writer takes care of escaping angle brackets and ampersands.
        /// Nothing is written when <paramref name="propertyData"/> is null or empty.
        /// </summary>
        /// <param name="writer">Destination XML writer.</param>
        /// <param name="elementName">Name of the enclosing property element.</param>
        /// <param name="propertyData">The text to store.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="writer"/> is null, or <paramref name="elementName"/> is null or empty.
        /// </exception>
        internal static void WriteUnicodeString(XmlWriter writer, string elementName, string propertyData)
        {
            if (writer == null)
            {
                throw new ArgumentNullException("writer");
            }
            if (string.IsNullOrEmpty(elementName))
            {
                throw new ArgumentNullException("elementName");
            }
            if (string.IsNullOrEmpty(propertyData))
            {
                return;                 // nothing to store
            }

            // <elementName><Uni>NFC text</Uni></elementName>
            writer.WriteStartElement(elementName);
            writer.WriteStartElement("Uni");

            string composed = Icu.Normalize(propertyData, Icu.UNormalizationMode.UNORM_NFC);
            writer.WriteString(composed);

            writer.WriteEndElement();   // Uni
            writer.WriteEndElement();   // property element
        }
Example #18
0
        /// <summary>
        /// Reads one AUni alternative: builds a string from the element's text (converted to NFD)
        /// in the writing system named by its ws attribute.
        /// </summary>
        /// <param name="aUniNode">The AUni element.</param>
        /// <param name="wsf">Factory used to resolve the ws attribute value.</param>
        /// <param name="tsf">Factory used to create the string.</param>
        /// <param name="tss">Receives the created string, or null when nothing was read.</param>
        /// <returns>The resolved writing system value, or 0 when the element has no text, no
        /// usable ws attribute, or a ws that the factory resolves to 0.</returns>
        internal static int ReadMultiUnicodeAlternative(XElement aUniNode, ILgWritingSystemFactory wsf, ITsStrFactory tsf, out ITsString tss)
        {
            tss = null;

            var content = aUniNode.Value;
            if (String.IsNullOrEmpty(content))
            {
                return(0);
            }

            var  wsAttribute = aUniNode.Attribute("ws");
            bool hasWsId     = wsAttribute != null && !String.IsNullOrEmpty(wsAttribute.Value);
            if (!hasWsId)
            {
                return(0);
            }

            // Throwing out a string without a ws is probably better than crashing
            // and preventing a db from being opened.
            // This code currently accepts this data, only storing en__IPA and fr strings.
            // <Form>
            // <AUni ws="en" />
            // <AUni ws="en__IPA">problematic</AUni>
            // <AUni>missing</AUni>
            // <AUni></AUni>
            // <AUni ws="fr">french</AUni>
            // <AUni/>
            // </Form>
            var wsHandle = wsf.GetWsFromStr(wsAttribute.Value);
            if (wsHandle == 0)
            {
                return(0);
            }

            tss = tsf.MakeString(Icu.Normalize(content, Icu.UNormalizationMode.UNORM_NFD), wsHandle);
            return(wsHandle);
        }
Example #19
0
        /// <summary>
        /// Handle the CommitText event. Transfer to GUI thread if necessary.
        /// Inserts the committed text at the selection, then re-establishes the saved preedit
        /// selection (shifted by the length of the inserted text) and puts the left-over preedit
        /// string back. When an action handler is available the work is wrapped in one undo task.
        /// </summary>
        /// <param name="text">The committed text.</param>
        private void CommitTextEventHandler(string text)
        {
            // Not on the thread that owns the root site: re-post this same handler there and leave.
            if (AssociatedSimpleRootSite.InvokeRequired)
            {
                AssociatedSimpleRootSite.SafeBeginInvoke(
                    new CommitDelegate(CommitTextEventHandler), new object[] { text });
                return;
            }

            IActionHandler actionHandler = AssociatedSimpleRootSite.DataAccess.GetActionHandler();

            try
            {
                if (actionHandler != null)
                {
                    actionHandler.BeginUndoTask(Resources.ksUndoTyping, Resources.ksRedoTyping);
                }

                // Save existing Preedit Selection and existing left-over preedit string.
                var       preeditSelection = SavePreeditSelection();
                ITsString preedit;
                preeditSelection.Selection.GetSelectionString(out preedit, String.Empty);

                // Change selection to a insertion point (unless we moved the selection before,
                // which happens when we come here as part of processing a mouse click)
                // And insert commit text.
                var selHelper = new SelectionHelper(
                    m_savedPreeditSelection ?? AssociatedSimpleRootSite.EditingHelper.CurrentSelection);
                if (m_savedPreeditSelection == null)
                {
                    selHelper.ReduceToIp(SelectionHelper.SelLimitType.Anchor);
                }

                selHelper.SetSelection(true);
                AssociatedSimpleRootSite.EditingHelper.OnCharAux(text, VwShiftStatus.kfssNone,
                                                                 Keys.None);

                // NOTE(review): text is passed by ref and trimmed only AFTER it was handed to
                // OnCharAux above; presumably OnCharAux applies the leading backspaces itself and
                // the returned count is needed just for the offset arithmetic below -- confirm.
                int deletedChars = TrimBeginningBackspaces(ref text);

                // Update the saved preedit selection to take account of the inserted text
                // text is in NFC, but the view uses it in NFD, so we have to convert it.
                // We don't do this if we moved the selection prior to this method.
                int textLenNFD = m_savedPreeditSelection != null ? 0 :
                                 Icu.Normalize(text, Icu.UNormalizationMode.UNORM_NFD).Length;
                int anchor = preeditSelection.IchAnchor + textLenNFD - deletedChars;

                // select the text we just inserted
                // TODO: underline the text so that it is more obvious that this is just preedit text
                preeditSelection.SetIch(SelectionHelper.SelLimitType.Anchor, anchor);
                preeditSelection.SetIch(SelectionHelper.SelLimitType.End,
                                        preeditSelection.IchEnd + textLenNFD - deletedChars);

                // reshow the preedit selection
                RestorePreeditSelection(preeditSelection);
                preeditSelection.Selection.ReplaceWithTsString(preedit);
                preeditSelection.SetSelection(true);
            }
            finally
            {
                // Always forget the saved selection and close the undo task, even on failure.
                m_savedPreeditSelection = null;
                if (actionHandler != null)
                {
                    actionHandler.EndUndoTask();
                }
            }
        }
Example #20
0
        // Implementation of both get_NormalizedForm and NfdAndFixOffsets
        private ITsString get_NormalizedFormAndFixOffsets(FwNormalizationMode nm, ArrayPtr oldOffsetsToFix, int numOffsetsToFix)
        {
            // Can we skip unnecessary work?
            if (IsAlreadyNormalized(nm))
            {
                return(this);
            }
            if (string.IsNullOrEmpty(Text))
            {
                NoteAlreadyNormalized(nm);
                return(this);
            }

            if (nm == FwNormalizationMode.knmLim)
            {
                throw new ArgumentException("Normalization mode may not be knmLim", "nm");
            }

            // NFSC needs to be decomposed first, then recomposed as NFC.
            if (nm == FwNormalizationMode.knmNFSC && !get_IsNormalizedForm(FwNormalizationMode.knmNFD))
            {
                var nfd = (TsString)get_NormalizedForm(FwNormalizationMode.knmNFD);
                // Line below is *not* a typo; this call will not recurse infinitely.
                return(nfd.get_NormalizedFormAndFixOffsets(FwNormalizationMode.knmNFSC, oldOffsetsToFix, numOffsetsToFix));
            }

            bool willFixOffsets = numOffsetsToFix > 0 && oldOffsetsToFix != null && oldOffsetsToFix.IntPtr != IntPtr.Zero;
            // Keys = offsets into original string, values = offsets into normalized string
            var stringOffsetMapping = willFixOffsets ? new Dictionary <int, int>() : null;            // Don't allocate an object if we'll never use it

            Icu.UNormalizationMode icuMode = (nm == FwNormalizationMode.knmNFSC) ? Icu.UNormalizationMode.UNORM_NFC : (Icu.UNormalizationMode)nm;
            IntPtr icuNormalizer           = Icu.GetIcuNormalizer(icuMode);

            TsStrBldr resultBuilder = new TsStrBldr();
            int       segmentMin    = 0;

            foreach (int segmentLim in EnumerateSegmentLimits(icuNormalizer))
            {
                string       segment           = GetChars(segmentMin, segmentLim);
                string       normalizedSegment = Icu.Normalize(segment, icuNormalizer);
                int          curRun            = get_RunAt(segmentMin);
                int          curRunLim         = get_LimOfRun(curRun);
                ITsTextProps curTextProps      = get_Properties(curRun);
                if (curRunLim >= segmentLim)
                {
                    // The segment is contained entirely in the current run, so our job is simple
                    int outputLenSoFar = resultBuilder.Length;
                    resultBuilder.Replace(outputLenSoFar, outputLenSoFar, normalizedSegment, curTextProps);
                    // Calculate the orig -> norm index mappings if (and only if) they're needed, since this calculation is expensive
                    if (willFixOffsets)
                    {
                        foreach (RearrangedIndexMapping mapping in MatchUpIndexesAfterNormalization(segment, normalizedSegment, icuNormalizer))
                        {
                            // Note that our local mapping is from the start of this segment, but we want to keep track of indexes from the start
                            // of the *string*. (Both the original string and the output, normalized string). So we adjust the indexes here.
                            if (mapping.isFirstCharOfDecomposition)
                            {
                                stringOffsetMapping[segmentMin + mapping.origIdx] = outputLenSoFar + mapping.normIdx;
                            }
                        }
                    }
                }
                else
                {
                    // The segment straddles two runs, so our job is harder. We have to either deal with decomposition
                    // rearranging things (and make sure the right characters maintain the right text properties), or
                    // else we have to deal with composition possibly trying to "compress" some diacritics that straddle
                    // a run border (which can happen, for example, if they have different text properties).

                    if (nm == FwNormalizationMode.knmNFD || nm == FwNormalizationMode.knmNFKD)
                    {
                        // Decomposition: we have to deal with rearranging. Some characters from after the first run's
                        // endpoint may have ended up "inside" the first run after rearranging, so their text properties
                        // will be incorrect at first. We'll fix them up after calculating the orig -> norm index mappings.

                        int outputLenSoFar = resultBuilder.Length;                         // This will be the start index from which
                        resultBuilder.Replace(outputLenSoFar, outputLenSoFar, normalizedSegment, curTextProps);

                        // Now correct the text properties, one index at a time.
                        IEnumerable <RearrangedIndexMapping> indexMappings = MatchUpIndexesAfterNormalization(segment, normalizedSegment, icuNormalizer);
                        foreach (RearrangedIndexMapping mapping in indexMappings)
                        {
                            ITsTextProps origProperties = get_PropertiesAt(segmentMin + mapping.origIdx);
                            int          outputIdx      = outputLenSoFar + mapping.normIdx;
                            int          size           = Char.IsSurrogate(normalizedSegment, mapping.normIdx) ? 2 : 1;
                            resultBuilder.SetProperties(outputIdx, outputIdx + size, origProperties);
                            // And if we also need to fix up offsets at the end, we keep track of the ones we'll need
                            if (willFixOffsets && mapping.isFirstCharOfDecomposition)
                            {
                                stringOffsetMapping[segmentMin + mapping.origIdx] = outputLenSoFar + mapping.normIdx;
                            }
                        }
                    }

                    else if (nm == FwNormalizationMode.knmNFSC)
                    {
                        // Composition that preserves styles. By this point, our input is NFD so we at least know there will be no rearranging.

                        // If there is more than one character remaining in the current run, then we might be able to compose those, at least.
                        if (curRunLim - segmentMin > 1)
                        {
                            // Unicode canonical ordering is such that any subsequence of a composed character can itself be composed, so this is safe.
                            string remainderOfFirstRun = GetChars(segmentMin, curRunLim);
                            string normalizedRemainder = Icu.Normalize(remainderOfFirstRun, icuNormalizer);
                            resultBuilder.Replace(resultBuilder.Length, resultBuilder.Length, normalizedRemainder, curTextProps);
                            // Now the start of the un-composable part is just the limit of the first run (which is the start of the second run).
                            segmentMin = curRunLim;
                        }
                        // Now there could be any NUMBER of runs between currentInputIdx and segmentLim. Maybe there are TEN composing
                        // characters, each with different text properties (and thus different runs). However, since the base character
                        // was in the first run, none of the characters from the second or subsequent runs are composable any longer. So we
                        // can copy them to the output as-is as one big TsString, which will carry text, runs and all.
                        ITsString uncomposablePartOfSegment = GetSubstring(segmentMin, segmentLim);
                        resultBuilder.ReplaceTsString(resultBuilder.Length, resultBuilder.Length, uncomposablePartOfSegment);
                    }

                    else
                    {
                        // For NFC and NFKC, we do not try to preserve styles or offset mappings, so this branch is quite simple
                        int outputLenSoFar = resultBuilder.Length;
                        resultBuilder.Replace(outputLenSoFar, outputLenSoFar, normalizedSegment, curTextProps);
                    }
                }
                segmentMin = segmentLim;                 // Next segment will start where the current segment ended
            }
            if (willFixOffsets)
            {
                stringOffsetMapping[segmentMin] = resultBuilder.Length;
                int ptrSize = Marshal.SizeOf(typeof(IntPtr));
                for (int i = 0; i < numOffsetsToFix; i++)
                {
                    IntPtr offsetPtr = Marshal.ReadIntPtr(oldOffsetsToFix.IntPtr, i * ptrSize);
                    int    oldOffset = Marshal.ReadInt32(offsetPtr);
                    int    newOffset;
                    if (stringOffsetMapping.TryGetValue(oldOffset, out newOffset))
                    {
                        Marshal.WriteInt32(offsetPtr, newOffset);
                    }
                    else
                    {
                        // The only likely way for one of the offsets we've been asked to fix up to NOT
                        // be found in the offset mapping dictionary is if it happened to be an offset
                        // to the second half of a surrogate pair. In which case we want to fix it up to
                        // point to wherever the first half of that pair ended up, so searching downwards
                        // through the offset mapping dictionary will find the best match.
                        bool found = false;
                        while (!found && oldOffset > 0)
                        {
                            oldOffset--;
                            found = stringOffsetMapping.TryGetValue(oldOffset, out newOffset);
                        }
                        // Any offset that could not be matched at all will be pointed at the beginning
                        // of the TsString, since that's safe with strings of all sizes (including empty).
                        Marshal.WriteInt32(offsetPtr, found ? newOffset : 0);
                    }
                }
            }
            var result = (TsString)resultBuilder.GetString();

            result.NoteAlreadyNormalized(nm);             // So we won't have to do all this work a second time
            return(result);
        }
Example #21
0
        /// <summary>
        /// Serializes the <see cref="ITsString"/> to XML.
        /// </summary>
        /// <remarks>
        /// The string is converted to its NFSC form before writing. The root element is
        /// <c>AStr</c> with a <c>ws</c> attribute when <paramref name="ws"/> is positive, and
        /// <c>Str</c> otherwise. Every run is written as a <c>Run</c> element (wrapped in an
        /// <c>Item</c> element when the run's mark-item property is forced on), and runs are
        /// grouped under a <c>Field</c> element whenever a run carries a field name that differs
        /// from the one currently open.
        /// </remarks>
        /// <param name="tss">The string to serialize.</param>
        /// <param name="lgwsf">Writing system factory used to look up the identifier for
        /// <paramref name="ws"/>; it is also handed to the integer property writer.</param>
        /// <param name="ws">Writing system handle; a value greater than zero selects the
        /// <c>AStr</c> root element.</param>
        /// <param name="writeObjData">When <c>false</c>, runs whose object data is a hot picture
        /// or a hot (owned) name GUID are left out of the output entirely.</param>
        /// <param name="indent">Selects the indent string: two spaces when <c>true</c>, an empty
        /// string when <c>false</c>. The writer's <c>Indent</c> setting itself is always on.</param>
        /// <returns>The XML text, without an XML declaration.</returns>
        public static string SerializeTsStringToXml(ITsString tss, ILgWritingSystemFactory lgwsf, int ws = 0, bool writeObjData = true, bool indent = false)
        {
            // We export only the NFSC form (NFC with exceptions for the parallel style information)
            ITsString normalizedTss = tss.get_NormalizedForm(FwNormalizationMode.knmNFSC);

            var xml      = new StringBuilder();
            var settings = new XmlWriterSettings
            {
                OmitXmlDeclaration = true,
                Indent             = true,
                IndentChars        = indent ? "  " : string.Empty,
                NewLineChars       = Environment.NewLine
            };

            using (var writer = XmlWriter.Create(xml, settings))
            {
                if (ws > 0)
                {
                    string id = lgwsf.GetStrFromWs(ws);
                    writer.WriteStartElement("AStr");
                    writer.WriteAttributeString("ws", Icu.Normalize(id, Icu.UNormalizationMode.UNORM_NFC));
                }
                else
                {
                    writer.WriteStartElement("Str");
                }

                // Write the properties and text for each run
                string fieldName = null;
                for (int i = 0; i < normalizedTss.RunCount; i++)
                {
                    TsRunInfo    tri;
                    ITsTextProps textProps = normalizedTss.FetchRunInfo(i, out tri);
                    string       objDataStr;
                    // The first character of the object data identifies its type. An empty value has
                    // no type character, so it must not be indexed (the string property loop below
                    // applies the same guard).
                    if (textProps.TryGetStringValue(FwTextPropType.ktptObjData, out objDataStr) && !writeObjData &&
                        !string.IsNullOrEmpty(objDataStr))
                    {
                        var chType = (FwObjDataTypes)objDataStr[0];
                        if (chType == FwObjDataTypes.kodtPictEvenHot || chType == FwObjDataTypes.kodtPictOddHot ||
                            chType == FwObjDataTypes.kodtNameGuidHot || chType == FwObjDataTypes.kodtOwnNameGuidHot)
                        {
                            // Caller asked for object data to be omitted: drop the whole run.
                            continue;
                        }
                    }

                    // Switch Field elements when this run names a different field than the open one.
                    string runFieldName;
                    if (textProps.TryGetStringValue(FwTextPropType.ktptFieldName, out runFieldName) && fieldName != runFieldName)
                    {
                        if (!string.IsNullOrEmpty(fieldName))
                        {
                            writer.WriteEndElement();
                        }
                        if (!string.IsNullOrEmpty(runFieldName))
                        {
                            writer.WriteStartElement("Field");
                            writer.WriteAttributeString("name", runFieldName);
                        }
                        fieldName = runFieldName;
                    }

                    // A run whose mark-item toggle is forced on is wrapped in an Item element.
                    bool          markItem;
                    FwTextPropVar var;
                    int           markItemValue;
                    if (textProps.TryGetIntValue(FwTextPropType.ktptMarkItem, out var, out markItemValue) &&
                        var == FwTextPropVar.ktpvEnum && markItemValue == (int)FwTextToggleVal.kttvForceOn)
                    {
                        writer.WriteStartElement("Item");
                        writer.WriteStartElement("Run");
                        markItem = true;
                    }
                    else
                    {
                        writer.WriteStartElement("Run");
                        markItem = false;
                    }

                    // Integer properties; the mark-item property is already represented by the Item wrapper.
                    for (int j = 0; j < textProps.IntPropCount; j++)
                    {
                        FwTextPropType tpt;
                        int            value = textProps.GetIntProperty(j, out tpt, out var);
                        if (tpt != FwTextPropType.ktptMarkItem)
                        {
                            TsPropsSerializer.WriteIntProperty(writer, lgwsf, tpt, var, value);
                        }
                    }

                    // String properties; the object data type also decides what the element content will be.
                    byte[] pict    = null;
                    bool   hotGuid = false;
                    for (int j = 0; j < textProps.StrPropCount; j++)
                    {
                        FwTextPropType tpt;
                        string         value = textProps.GetStringProperty(j, out tpt);
                        TsPropsSerializer.WriteStringProperty(writer, tpt, value);
                        if (tpt == FwTextPropType.ktptObjData && !string.IsNullOrEmpty(value))
                        {
                            switch ((FwObjDataTypes)value[0])
                            {
                            // The element data associated with a picture is the actual picture data
                            // since it is much too large to want embedded as an XML attribute value.
                            // (This is an antique kludge that isn't really used in practice, but some
                            // of our test data still exercises it.)
                            case FwObjDataTypes.kodtPictEvenHot:
                            case FwObjDataTypes.kodtPictOddHot:
                                pict = Encoding.Unicode.GetBytes(value.Substring(1));
                                break;

                            // The generated XML contains both the link value as an attribute and the
                            // (possibly edited) display string as the run's element data.
                            case FwObjDataTypes.kodtExternalPathName:
                                break;

                            // used ONLY in the clipboard...contains XML representation of (currently) a footnote.
                            case FwObjDataTypes.kodtEmbeddedObjectData:
                                break;

                            // The string data associated with this run is assumed to be a dummy magic
                            // character that flags (redundantly for XML) that the actual data to
                            // display is based on the ktptObjData attribute.
                            case FwObjDataTypes.kodtNameGuidHot:
                            case FwObjDataTypes.kodtOwnNameGuidHot:
                            case FwObjDataTypes.kodtContextString:
                            case FwObjDataTypes.kodtGuidMoveableObjDisp:
                                hotGuid = true;
                                break;
                            }
                        }
                    }

                    if (pict != null)
                    {
                        // Write the bytes of the picture data as upper-case hex, 32 bytes per line
                        var sb = new StringBuilder();
                        for (int j = 0; j < pict.Length; j++)
                        {
                            sb.Append(pict[j].ToString("X2"));
                            if (j % 32 == 31)
                            {
                                sb.AppendLine();
                            }
                        }
                        writer.WriteString(sb.ToString());
                    }
                    else if (hotGuid)
                    {
                        // The run text is only a placeholder character, so no text is written.
                        writer.WriteString(string.Empty);
                    }
                    else
                    {
                        string runText = normalizedTss.get_RunText(i) ?? string.Empty;
                        // Mark whitespace-only runs so that the whitespace is kept as significant content.
                        if (runText != string.Empty && runText.All(char.IsWhiteSpace))
                        {
                            writer.WriteAttributeString("xml", "space", "", "preserve");
                        }
                        // TODO: should we escape quotation marks? this is not necessary but different than the behavior of the C++ implementation
                        writer.WriteString(Icu.Normalize(runText, Icu.UNormalizationMode.UNORM_NFC));
                    }

                    writer.WriteEndElement();     // Run
                    if (markItem)
                    {
                        writer.WriteEndElement(); // Item
                    }
                }
                if (!string.IsNullOrEmpty(fieldName))
                {
                    writer.WriteEndElement();     // last open Field
                }
                writer.WriteEndElement();         // AStr / Str
            }
            return(xml.ToString());
        }
Example #22
0
        /// <summary>
        /// Get the set of significant digraphs (multigraphs) for the writing system.  At the
        /// moment, these are derived from ICU sorting rules associated with the writing system.
        /// </summary>
        /// <remarks>
        /// Results are cached per writing system identifier in m_mapWsDigraphs and
        /// m_mapWsMapChars, so the rules are parsed only on the first request for a given
        /// <paramref name="sWs"/>.
        /// </remarks>
        /// <param name="sWs">Writing system identifier; used as the cache key, to look up the
        /// writing system, and as the locale for lower-casing.</param>
        /// <param name="mapChars">Receives a map from a lower-cased, NFD-normalized variant
        /// (taken from rules that contain only "equal" relations) to the lower-cased primary
        /// grapheme it is treated as equal to.</param>
        /// <returns>The lower-cased, NFD-normalized rule elements that are longer than one
        /// UTF-16 code unit. Empty when no usable ICU rules are found.</returns>
        private Set <string> GetDigraphs(string sWs, out Dictionary <string, string> mapChars)
        {
            Set <string> digraphs = null;

            if (m_mapWsDigraphs.TryGetValue(sWs, out digraphs))
            {
                mapChars = m_mapWsMapChars[sWs];
                return(digraphs);
            }
            digraphs = new Set <string>();
            mapChars = new Dictionary <string, string>();
            int            ws        = m_cache.LanguageWritingSystemFactoryAccessor.GetWsFromStr(sWs);
            IWritingSystem wsX       = null;
            ICollation     coll      = null;
            string         sIcuRules = null;

            if (ws > 0)
            {
                wsX = m_cache.LanguageWritingSystemFactoryAccessor.get_EngineOrNull(ws);
                // get_EngineOrNull can (per its name) hand back null; in that case there are
                // simply no rules to inspect and the empty results are cached below.
                if (wsX != null && wsX.CollationCount > 0)
                {
                    coll      = wsX.get_Collation(0);
                    sIcuRules = coll.IcuRules;
                    if (String.IsNullOrEmpty(sIcuRules))
                    {
                        // The ICU rules may not be loaded for built-in languages, but are
                        // still helpful for our purposes here.
                        string sIcuOrig = sIcuRules;
                        coll.LoadIcuRules(sWs);
                        sIcuRules     = coll.IcuRules;
                        coll.IcuRules = sIcuOrig;                               // but we don't want to actually change anything!
                    }
                }
            }
            if (!String.IsNullOrEmpty(sIcuRules) && sIcuRules.Contains("&"))
            {
                string[] rgsRules = sIcuRules.Split(new char[] { '&' }, StringSplitOptions.RemoveEmptyEntries);
                for (int i = 0; i < rgsRules.Length; ++i)
                {
                    string sRule = rgsRules[i];
                    // This is a valid rule that specifies that the digraph aa should be ignored
                    // [last tertiary ignorable] = \u02bc = aa
                    // but the code here will ignore this. YAGNI the chances of a user specifying a digraph
                    // as ignorable may never happen.
                    // Everything from the first '[' onwards is dropped (ordinal search).
                    int ichBracket = sRule.IndexOf('[');
                    if (ichBracket >= 0)
                    {
                        sRule = sRule.Substring(0, ichBracket);
                    }
                    if (String.IsNullOrEmpty(sRule.Trim()))
                    {
                        continue;
                    }
                    // Secondary and tertiary differences are treated as equality for our purposes.
                    sRule = sRule.Replace("<<<", "=");
                    sRule = sRule.Replace("<<", "=");
                    if (sRule.Contains("<"))
                    {
                        // "&N<ng<<<Ng<ny<<<Ny" => "&N<ng=Ng<ny=Ny"
                        // "&N<ñ<<<Ñ" => "&N<ñ=Ñ"
                        // There are other issues we are not handling properly such as the next line
                        // &N<\u006e\u0067
                        string[] rgsPieces = sRule.Split(new char[] { '<', '=' }, StringSplitOptions.RemoveEmptyEntries);
                        for (int j = 0; j < rgsPieces.Length; ++j)
                        {
                            string sGraph = rgsPieces[j];
                            sGraph = sGraph.Trim();
                            if (String.IsNullOrEmpty(sGraph))
                            {
                                continue;
                            }
                            sGraph = Icu.Normalize(sGraph, Icu.UNormalizationMode.UNORM_NFD);
                            // Only elements longer than one code unit after decomposition are recorded.
                            if (sGraph.Length > 1)
                            {
                                sGraph = Icu.ToLower(sGraph, sWs);
                                if (!digraphs.Contains(sGraph))
                                {
                                    digraphs.Add(sGraph);
                                }
                            }
                        }
                    }
                    else if (sRule.Contains("="))
                    {
                        // "&ae<<æ<<<Æ" => "&ae=æ=Æ"
                        string[] rgsPieces = sRule.Split(new char[] { '=' }, StringSplitOptions.RemoveEmptyEntries);
                        if (rgsPieces.Length == 0)
                        {
                            // The rule held nothing but separators, so there is no primary to map to.
                            continue;
                        }
                        string sGraphPrimary = rgsPieces[0].Trim();
                        Debug.Assert(!String.IsNullOrEmpty(sGraphPrimary));
                        sGraphPrimary = Icu.ToLower(sGraphPrimary, sWs);
                        for (int j = 1; j < rgsPieces.Length; ++j)
                        {
                            string sGraph = rgsPieces[j];
                            sGraph = sGraph.Trim();
                            if (String.IsNullOrEmpty(sGraph))
                            {
                                continue;
                            }
                            sGraph = Icu.Normalize(sGraph, Icu.UNormalizationMode.UNORM_NFD);
                            sGraph = Icu.ToLower(sGraph, sWs);
                            // Map each distinct variant to the primary; the first mapping for a variant wins.
                            if (sGraph != sGraphPrimary)
                            {
                                if (!mapChars.ContainsKey(sGraph))
                                {
                                    mapChars.Add(sGraph, sGraphPrimary);
                                }
                            }
                        }
                    }
                }
            }
            // Cache the results (possibly empty) so this writing system is not processed again.
            m_mapWsDigraphs.Add(sWs, digraphs);
            m_mapWsMapChars.Add(sWs, mapChars);
            return(digraphs);
        }
Example #23
0
        /// <summary>
        /// We can't declare these arguments (char * in C++) as [MarshalAs(UnmanagedType.LPStr)] string, because that
        /// unconditionally converts the string to bytes using the current system code page, which is never what we want.
        /// So we declare them as byte[] and marshal like this. The C++ code requires null termination so add a null
        /// before converting. (This doesn't seem to be necessary, but better safe than sorry.)
        /// </summary>
        /// <param name="word"></param>
        /// <returns></returns>
#if __MonoCS__
        private static byte[] MarshallAsUtf8Bytes(string word)
        {
            // Compose to NFC first, then append the terminating null before encoding as UTF-8.
            string composed       = Icu.Normalize(word, Icu.UNormalizationMode.UNORM_NFC);
            string nullTerminated = composed + "\0";
            return Encoding.UTF8.GetBytes(nullTerminated);
        }
Example #24
0
 private static string MarshallAsUtf8Bytes(string word)
 {
     // NFC-normalize the word, append a null character, and round-trip the text through UTF-8.
     string   terminated = Icu.Normalize(word, Icu.UNormalizationMode.UNORM_NFC) + "\0";
     Encoding utf8       = Encoding.UTF8;
     return utf8.GetString(utf8.GetBytes(terminated));
 }