Пример #1
0
    // Breaks `input` with a fresh CustomBreaker and checks that the produced segments
    // are exactly the strings in `output`, in order.
    //   input   - text handed to CustomBreaker.BreakWords.
    //   output  - expected segments.
    //   options - breaker settings to apply; a default TestOptions is used when null.
    public void BasicTest(string input, string[] output, TestOptions options = null)
    {
        options = options ?? new TestOptions();

        // Offsets reported by the break handler. Seeded with 0 so that every adjacent
        // pair (outputList[i], outputList[i + 1]) delimits one segment of `input`.
        var outputList = new List <int> {
            0
        };
        var customBreaker = new CustomBreaker();

        customBreaker.SetNewBreakHandler(vis => outputList.Add(vis.LatestBreakAt));
        //options
        customBreaker.BreakNumberAfterText = options.BreakNumberAfterText;
        customBreaker.EngBreakingEngine.SurrogatePairBreakingOption = options.SurrogatePairBreakingOption;

        customBreaker.BreakWords(input);

        // The segment count must match first: otherwise a run that reports too few breaks
        // would leave the trailing expected entries unchecked, and a run that reports too
        // many would die with an index error instead of a readable assertion failure.
        Assert.AreEqual(output.Length, outputList.Count - 1);

        for (int i = 0; i < outputList.Count - 1; i++)
        {
            Assert.AreEqual
            (
                output[i],
                input.Substring(outputList[i], outputList[i + 1] - outputList[i])
            );
        }
    }
Пример #2
0
        // Click handler: breaks the current text of textBox1 with a CustomBreaker created by
        // CustomBreakerBuilder and lists every reported span in listBox1 as "<startAt> <text>".
        private void cmdManaged_Click(object sender, EventArgs e)
        {
            // Some languages (e.g. Thai, Lao) need dictionary-based breaking;
            // the dictionary data used here comes from the icu-project.
            // TODO: the dictionary should be read only once
            var dictionaryProvider = new IcuSimpleTextFileDictionaryProvider() { DataDir = "../../../icu62/brkitr" };
            CustomBreakerBuilder.Setup(dictionaryProvider);

            CustomBreaker wordBreaker = CustomBreakerBuilder.NewCustomBreaker();
            wordBreaker.BreakNumberAfterText = true;

            char[] buffer = this.textBox1.Text.ToCharArray();
            this.listBox1.Items.Clear();

            // Each callback turns the latest span into one list entry.
            wordBreaker.SetNewBreakHandler(vis =>
            {
                BreakSpan span  = vis.GetBreakSpan();
                string word     = new string(buffer, span.startAt, span.len);
                string entry    = span.startAt + " " + word;
                this.listBox1.Items.Add(entry);
            });

            wordBreaker.BreakWords(buffer, 0, buffer.Length);
        }
Пример #3
0
        // Demonstration: builds a brand-new CustomBreaker, configures it from the form controls,
        // breaks `inputBuffer` and lists every reported span in listBox1 as "<startAt> <text>".
        // A breaker does not have to be created for every call; it can be reused.
        void InitNewCustomTextBreakerAndBreakWords(char[] inputBuffer)
        {
            // Some languages (e.g. Thai, Lao) need dictionary-based breaking;
            // the dictionary data used here comes from the icu-project.
            // TODO: the dictionary should be read only once
            var dictionaryProvider = new IcuSimpleTextFileDictionaryProvider() { DataDir = "../../../icu62/brkitr" };
            CustomBreakerBuilder.Setup(dictionaryProvider);

            CustomBreaker wordBreaker = CustomBreakerBuilder.NewCustomBreaker();

            // Settings taken from the UI. Original author's note on the unicode-range option:
            // it breaks into groups of consecutive unicode ranges and does not use the dictionary breaker.
            wordBreaker.EngBreakingEngine.SurrogatePairBreakingOption = (SurrogatePairBreakingOption)cmbSurrogatePairBreakOptions.SelectedItem;
            wordBreaker.UseUnicodeRangeBreaker = chkUseUnicodeRangeBreaker.Checked;
            wordBreaker.BreakNumberAfterText   = true;

            this.listBox1.Items.Clear();

            // Each callback turns the latest span into one list entry.
            wordBreaker.SetNewBreakHandler(vis =>
            {
                BreakSpan span  = vis.GetBreakSpan();
                string word     = new string(inputBuffer, span.startAt, span.len);
                string entry    = span.startAt + " " + word;
                this.listBox1.Items.Add(entry);
            });

            wordBreaker.BreakWords(inputBuffer, 0, inputBuffer.Length);
        }
Пример #4
0
        // Runs the managed breaker over the text of textBox1 `ntimes` times.
        // The break results are discarded; only the breaking work itself is performed.
        void ParseWithManaged(int ntimes)
        {
            var dictionaryProvider = new IcuSimpleTextFileDictionaryProvider() { DataDir = "../../../icu58/brkitr_src" };
            CustomBreakerBuilder.Setup(dictionaryProvider);

            CustomBreaker wordBreaker = CustomBreakerBuilder.NewCustomBreaker();
            wordBreaker.SetNewBreakHandler(vis => { }); // results are intentionally ignored

            char[] buffer = this.textBox1.Text.ToCharArray();

            // Counts down from ntimes - 1 to 0, i.e. one pass per requested repetition.
            int remaining = ntimes - 1;
            while (remaining >= 0)
            {
                wordBreaker.BreakWords(buffer, 0, buffer.Length);
                --remaining;
            }
        }
Пример #5
0
        // Runs the managed breaker, configured from the form controls, over the text of
        // textBox1 `ntimes` times. The break results are discarded.
        void ParseWithManaged(int ntimes)
        {
            var dictionaryProvider = new IcuSimpleTextFileDictionaryProvider() { DataDir = "../../../icu58/brkitr_src" };
            CustomBreakerBuilder.Setup(dictionaryProvider);

            CustomBreaker wordBreaker = CustomBreakerBuilder.NewCustomBreaker();

            // Settings taken from the UI.
            wordBreaker.UseUnicodeRangeBreaker = chkUseUnicodeRangeBreaker.Checked;
            wordBreaker.EngBreakingEngine.SurrogatePairBreakingOption = (SurrogatePairBreakingOption)cmbSurrogatePairBreakOptions.SelectedItem;
            wordBreaker.SetNewBreakHandler(vis => { }); // results are intentionally ignored

            char[] buffer = this.textBox1.Text.ToCharArray();

            // Counts down from ntimes - 1 to 0, i.e. one pass per requested repetition.
            int remaining = ntimes - 1;
            while (remaining >= 0)
            {
                wordBreaker.BreakWords(buffer, 0, buffer.Length);
                --remaining;
            }
        }
Пример #6
0
        // Click handler: breaks a fixed sample sentence (mixed right-to-left script and Latin text)
        // and lists every reported span in listBox1 as "<startAt> <text>".
        private void button1_Click(object sender, EventArgs e)
        {
            this.listBox1.Items.Clear();

            const string sampleText = "یہ ایک (car) ہے۔";
            char[] buffer = sampleText.ToCharArray();

            var dictionaryProvider = new IcuSimpleTextFileDictionaryProvider() { DataDir = "../../../icu58/brkitr_src" };
            CustomBreakerBuilder.Setup(dictionaryProvider);

            CustomBreaker wordBreaker = CustomBreakerBuilder.NewCustomBreaker();

            // Each callback turns the latest span into one list entry.
            wordBreaker.SetNewBreakHandler(vis =>
            {
                BreakSpan span  = vis.GetBreakSpan();
                string word     = new string(buffer, span.startAt, span.len);
                string entry    = span.startAt + " " + word;
                this.listBox1.Items.Add(entry);
            });

            wordBreaker.BreakWords(buffer);
        }
Пример #7
0
    // Breaks "«Maître leçon»" and checks start offset, length and word kind of each of the
    // five spans reported by CustomBreaker.
    public void WordKindTest()
    {
        var breaker = new CustomBreaker {
            ThrowIfCharOutOfRange = true
        };
        var breakList = new List <BreakSpan>();

        char[] test = "«Maître leçon»".ToCharArray();

        // Collect one BreakSpan per handler callback.
        breaker.SetNewBreakHandler(vis => breakList.Add(vis.GetBreakSpan()));


#warning Use `breaker.BreakWords("«Maître leçon»", breakList);` once #156 is merged

        breaker.BreakWords(test, 0, test.Length);

        // Expected value goes first so that a failure message reports the counts the right way round.
        Assert.AreEqual(5, breakList.Count);

        // Field-by-field comparison of a reported span against the expected one.
        void BreakSpanEqual(BreakSpan actual, BreakSpan expected)
        {
            Assert.AreEqual(expected.startAt, actual.startAt);
            Assert.AreEqual(expected.len, actual.len);
            Assert.AreEqual(expected.wordKind, actual.wordKind);
        }

        BreakSpanEqual(breakList[0], new BreakSpan {
            startAt = 0, len = 1, wordKind = WordKind.Punc
        });
        BreakSpanEqual(breakList[1], new BreakSpan {
            startAt = 1, len = 6, wordKind = WordKind.Text
        });
        BreakSpanEqual(breakList[2], new BreakSpan {
            startAt = 7, len = 1, wordKind = WordKind.Whitespace
        });
        BreakSpanEqual(breakList[3], new BreakSpan {
            startAt = 8, len = 5, wordKind = WordKind.Text
        });
        BreakSpanEqual(breakList[4], new BreakSpan {
            startAt = 13, len = 1, wordKind = WordKind.Punc
        });
    }
Пример #8
0
    // Breaks `input` with a fresh CustomBreaker and checks that the produced segments
    // are exactly the strings in `output`, in order.
    //   input                - text handed to CustomBreaker.BreakWords.
    //   output               - expected segments.
    //   breakNumberAfterText - value assigned to CustomBreaker.BreakNumberAfterText.
    public void BasicTest(string input, string[] output, bool breakNumberAfterText = false)
    {
        // Offsets reported by the break handler. Seeded with 0 so that every adjacent
        // pair (outputList[i], outputList[i + 1]) delimits one segment of `input`.
        var outputList = new List <int> {
            0
        };
        var customBreaker = new CustomBreaker();

        customBreaker.SetNewBreakHandler(vis => outputList.Add(vis.LatestBreakAt));

        customBreaker.BreakNumberAfterText = breakNumberAfterText;

        customBreaker.BreakWords(input);

        // The segment count must match first: otherwise a run that reports too few breaks
        // would leave the trailing expected entries unchecked, and a run that reports too
        // many would die with an index error instead of a readable assertion failure.
        Assert.AreEqual(output.Length, outputList.Count - 1);

        for (int i = 0; i < outputList.Count - 1; i++)
        {
            Assert.AreEqual
            (
                output[i],
                input.Substring(outputList[i], outputList[i + 1] - outputList[i])
            );
        }
    }
Пример #9
0
 // Obtains a CustomBreaker from CustomBreakerBuilder and registers a handler that
 // appends every reported break offset to _breakAtList.
 public MyManagedTextBreaker()
 {
     //TODO: review config folder here
     _textBreaker = CustomBreakerBuilder.NewCustomBreaker();
     _textBreaker.SetNewBreakHandler(vis =>
     {
         _breakAtList.Add(vis.LatestBreakAt);
     });
 }
Пример #10
0
        public static Result <TextAtom> TextAtomFromLaTeX(string latexSource)
        {
            if (string.IsNullOrEmpty(latexSource))
            {
                return(new TextAtom.List(Array.Empty <TextAtom>(), 0));
            }
            bool?         displayMath     = null;
            StringBuilder mathLaTeX       = null;
            bool          backslashEscape = false;
            bool          afterCommand    = false; //ignore spaces after command
            bool          afterNewline    = false;
            int           dollarCount     = 0;
            var           globalAtoms     = new TextAtomListBuilder();
            var           breakList       = new List <BreakAtInfo>();

            breaker.SetNewBreakHandler(v =>
                                       breakList.Add(new BreakAtInfo(v.LatestBreakAt, v.LatestWordKind)));
            breaker.BreakWords(latexSource);

            // Acts on the run of consecutive '$' characters counted so far in dollarCount:
            //   0      - nothing to do;
            //   1 / 2  - toggles inline ($) / display ($$) math mode: opens it when no math mode is
            //            active, otherwise closes it by passing the collected mathLaTeX to atoms.Math;
            //   other  - reported as an error (dollarCount is left untouched in that case).
            // Returns an error when the closing delimiter does not match the open mode or when
            // atoms.Math reports one.
            Result CheckDollarCount(TextAtomListBuilder atoms)
            {
                if (dollarCount == 0)
                {
                    return(Ok());
                }
                if (dollarCount != 1 && dollarCount != 2)
                {
                    return("Invalid number of $: " + dollarCount);
                }

                // true for "$$" (display math), false for "$" (inline math)
                bool isDisplayDelimiter = dollarCount == 2;
                dollarCount = 0;

                if (displayMath == null)
                {
                    // No math mode active: this delimiter opens one.
                    mathLaTeX   = new StringBuilder();
                    displayMath = isDisplayDelimiter;
                    return(Ok());
                }

                if (displayMath != isDisplayDelimiter)
                {
                    // The delimiter kind differs from the one that opened the current mode.
                    return(isDisplayDelimiter
                           ? "Cannot close inline math mode with $$"
                           : "Cannot close display math mode with $");
                }

                // Matching delimiter: hand the collected source over and leave math mode.
                if (atoms.Math(mathLaTeX.ToString(), isDisplayDelimiter).Error is string mathError)
                {
                    return("[Math mode error] " + mathError);
                }
                mathLaTeX   = null;
                displayMath = null;
                return(Ok());
            }

            Result <int> BuildBreakList(ReadOnlySpan <char> latex, TextAtomListBuilder atoms,
                                        int i, bool oneCharOnly, char stopChar)
            {
                // Emits a break followed by a paragraph-indent space. TextLength is wound back
                // between the two calls, so both are issued from the same TextLength value.
                void ParagraphBreak()
                {
                    // Value passed to both atoms.Break and atoms.Space
                    // (presumably the source length they cover - TODO confirm).
                    const int coveredLength = 3;

                    atoms.Break(coveredLength);
#warning Should the newline and space occupy the same range?
                    atoms.TextLength = atoms.TextLength - coveredLength;
                    atoms.Space(Space.ParagraphIndent, coveredLength);
                }

                for (; i < breakList.Count; i++)
                {
                    // Looks up entry `index` of breakList and yields the section it closes:
                    // `start` is the previous entry's breakAt (0 for the first entry), `end` is this
                    // entry's breakAt, `section` is latexInput[start..end) and `kind` its word kind.
                    void ObtainSection(ReadOnlySpan <char> latexInput, int index,
                                       out int start, out int end, out ReadOnlySpan <char> section, out WordKind kind)
                    {
                        var entry = breakList[index];

                        start   = index == 0 ? 0 : breakList[index - 1].breakAt;
                        end     = entry.breakAt;
                        section = latexInput.Slice(start, end - start);
                        kind    = entry.wordKind;
                    }

                    ObtainSection(latex, i, out var startAt, out var endAt, out var textSection, out var wordKind);
                    // Steps i back by one (always, even when already at the first section) and, if a
                    // previous section existed, reloads startAt/endAt/section/wordKind for it.
                    // Returns whether there was a previous section.
                    bool PreviousSection(ReadOnlySpan <char> latexInput, ref ReadOnlySpan <char> section)
                    {
                        bool hadPrevious = i > 0;

                        i--;
                        if (!hadPrevious)
                        {
                            return(false);
                        }
                        ObtainSection(latexInput, i, out startAt, out endAt, out section, out wordKind);
                        return(true);
                    }

                    // Advances i by one (always) and, if that index is still inside breakList,
                    // reloads startAt/endAt/section/wordKind for it.
                    // Returns false once i has moved past the last section.
                    bool NextSection(ReadOnlySpan <char> latexInput, ref ReadOnlySpan <char> section)
                    {
                        i++;
                        if (i >= breakList.Count)
                        {
                            return(false);
                        }
                        ObtainSection(latexInput, i, out startAt, out endAt, out section, out wordKind);
                        return(true);
                    }

                    // Parses the argument that starts at the next section into its own atom list by
                    // re-entering BuildBreakList (oneCharOnly = true, no stop character). On success,
                    // i is moved to the index the nested call returned and the built list is returned.
                    Result <TextAtom.List> ReadArgumentAtom(ReadOnlySpan <char> latexInput)
                    {
                        backslashEscape = false;
                        var argumentBuilder = new TextAtomListBuilder();

                        i++;
                        var nested = BuildBreakList(latexInput, argumentBuilder, i, true, '\0');
                        return(nested.Bind(resumeIndex =>
                        {
                            i = resumeIndex;
                            return argumentBuilder.Build();
                        }));
                    }

                    // Reads the raw text of the brace-delimited argument that starts at the next section.
                    // Fails with "Missing argument" when there is no next section, "Missing {" when that
                    // section is not '{', and "Missing }" when no matching unescaped '}' is found.
                    // On success the text between the braces is returned and the section cursor is
                    // advanced until startAt reaches the index of the closing '}'.
                    SpanResult <char> ReadArgumentString(ReadOnlySpan <char> latexInput, ref ReadOnlySpan <char> section)
                    {
                        afterCommand = false;
                        if (!NextSection(latexInput, ref section))
                        {
                            return(Err("Missing argument"));
                        }
                        if (section.IsNot('{'))
                        {
                            return(Err("Missing {"));
                        }
                        int endingIndex = -1;
                        bool isEscape = false;

                        //startAt + 1 to not start at the { we started at
                        for (int j = startAt + 1, bracketDepth = 0; j < latexInput.Length; j++)
                        {
                            char current = latexInput[j];

                            if (current == '\\')
                            {
                                // Toggle instead of set: in "\\" the second backslash is itself escaped
                                // and must not escape whatever follows it (e.g. the '}' in "\\}").
                                isEscape = !isEscape;
                            }
                            else if (isEscape)
                            {
                                // Escaped character (including an escaped brace): consume the escape.
                                isEscape = false;
                            }
                            else if (current == '{')
                            {
                                bracketDepth++;
                            }
                            else if (current == '}')
                            {
                                if (bracketDepth > 0)
                                {
                                    bracketDepth--;
                                }
                                else
                                {
                                    endingIndex = j; break;
                                }
                            }
                        }
                        if (endingIndex == -1)
                        {
                            return(Err("Missing }"));
                        }
                        // endAt is the end of the opening '{' section, i.e. the first character of the argument
                        var resultText = latexInput.Slice(endAt, endingIndex - endAt);

                        while (startAt < endingIndex)
                        {
                            // Not expected to fail because the closing brace was found above, but stop
                            // if the sections run out rather than looping forever.
                            if (!NextSection(latexInput, ref section))
                            {
                                break;
                            }
                        }
                        return(Ok(resultText));
                    }

                    // Consumes the following sections for as long as they are punctuation whose first
                    // character is not one of the listed LaTeX special characters, and returns the
                    // input range from the end of the current section to the end of the last
                    // section consumed.
                    ReadOnlySpan <char> NextSectionUntilPunc(ReadOnlySpan <char> latexInput, ref ReadOnlySpan <char> section)
                    {
                        int rangeStart = endAt;
                        ReadOnlySpan <char> reservedChars = stackalloc[] { '#', '$', '%', '&', '\\', '^', '_', '{', '}', '~' };
                        bool steppedBack = false;

                        while (!steppedBack && NextSection(latexInput, ref section))
                        {
                            bool isPlainPunctuation = wordKind == WordKind.Punc && reservedChars.IndexOf(section[0]) == -1;

                            if (!isPlainPunctuation)
                            {
                                // Went one section too far: undo the last step and stop.
                                PreviousSection(latexInput, ref section);
                                steppedBack = true;
                            }
                        }
                        return(latexInput.Slice(rangeStart, endAt - rangeStart));
                    }

                    //Nothing should be before dollar sign checking -- dollar sign checking uses continue;
                    atoms.TextLength = startAt;
                    if (textSection.Is('$'))
                    {
                        if (backslashEscape)
                        {
                            if (displayMath != null)
                            {
                                mathLaTeX.Append(@"\$");
                            }
                            else
                            {
                                atoms.Text("$", NextSectionUntilPunc(latex, ref textSection));
                            }
                        }
                        else
                        {
                            dollarCount++;
                            continue;
                        }
                        backslashEscape = false;
                    }
                    else
                    {
                        { if (CheckDollarCount(atoms).Error is string error)
                          {
                              return(error);
                          }
                        }
                        if (!backslashEscape)
                        {
                            //Unescaped text section, inside display/inline math mode
                            if (displayMath != null)
                            {
                                switch (textSection)
                                {
                                case var _ when textSection.Is('$'):
                                    throw new InvalidCodePathException("The $ case should have been accounted for.");

                                case var _ when textSection.Is('\\'):
                                    backslashEscape = true;

                                    continue;

                                default:
                                    mathLaTeX.Append(textSection);
                                    break;
                                }
                            }
                            //Unescaped text section, not inside display/inline math mode
                            else
                            {
                                switch (textSection)
                                {
                                case var _ when stopChar > 0 && textSection[0] == stopChar:
                                    return(Ok(i));

                                case var _ when textSection.Is('$'):
                                    throw new InvalidCodePathException("The $ case should have been accounted for.");

                                case var _ when textSection.Is('\\'):
                                    backslashEscape = true;

                                    continue;

                                case var _ when textSection.Is('#'):
                                    return("Unexpected command argument reference character # outside of new command definition (currently unsupported)");

                                case var _ when textSection.Is('^'):
                                case var _ when textSection.Is('_'):
                                    return($"Unexpected script indicator {textSection[0]} outside of math mode");

                                case var _ when textSection.Is('&'):
                                    return($"Unexpected alignment tab character & outside of table environments");

                                case var _ when textSection.Is('~'):
                                    atoms.ControlSpace();

                                    break;

                                case var _ when textSection.Is('%'):
                                    var comment = new StringBuilder();

                                    while (NextSection(latex, ref textSection) && wordKind != WordKind.NewLine)
                                    {
                                        comment.Append(textSection);
                                    }
                                    atoms.Comment(comment.ToString());
                                    break;

                                case var _ when textSection.Is('{'):
                                    if (BuildBreakList(latex, atoms, ++i, false, '}').Bind(index => i = index).Error is string error)
                                    {
                                        return(error);
                                    }
                                    break;

                                case var _ when textSection.Is('}'):
                                    return("Unexpected }, unbalanced braces");

                                case var _ when wordKind == WordKind.NewLine:
                                    // Consume newlines after commands
                                    // Double newline == paragraph break
                                    if (afterNewline)
                                    {
                                        ParagraphBreak();
                                        afterNewline = false;
                                        break;
                                    }
                                    else
                                    {
                                        atoms.ControlSpace();
                                        afterNewline = true;
                                        continue;
                                    }

                                case var _ when wordKind == WordKind.Whitespace:
                                    //Collpase spaces
                                    if (afterCommand)
                                    {
                                        continue;
                                    }
                                    else
                                    {
                                        atoms.ControlSpace();
                                    }
                                    break;

                                default: //Just ordinary text
                                    if (oneCharOnly)
                                    {
                                        if (startAt + 1 < endAt) //Only re-read if current break span is more than 1 long
                                        {
                                            i--;
                                            breakList[i] = new BreakAtInfo(breakList[i].breakAt + 1, breakList[i].wordKind);
                                        }
                                        //Need to allocate in the end :(
                                        //Don't look ahead for punc; we are looking for one char only
                                        atoms.Text(textSection[0].ToString(), default);
                                    }
                                    else
                                    {
                                        atoms.Text(textSection.ToString(), NextSectionUntilPunc(latex, ref textSection));
                                    }
                                    break;
                                }
                            }
                            afterCommand = false;
                        }

                        //Escaped text section but in inline/display math mode
                        else if (displayMath != null)
                        {
                            switch (textSection)
                            {
                            case var _ when textSection.Is('$'):
                                throw new InvalidCodePathException("The $ case should have been accounted for.");

                            case var _ when textSection.Is('('):
                                return(displayMath switch
                                {
                                    true => "Cannot open inline math mode in display math mode",
                                    false => "Cannot open inline math mode in inline math mode",
                                    null => throw new InvalidCodePathException("displayMath is null. This switch should not be hit."),
                                });
Пример #11
0
    // Breaks a French poem with CustomBreaker, inserts "|" at every reported break offset
    // and compares the result with the hand-marked expected text.
    public void EngEngine()
    {
        //Text source: https://en.wikibooks.org/wiki/French/Texts/Simple/Le_Corbeau_et_le_Renard
        const string Le_Corbeau_et_le_Renard = @"
Maître Corbeau, sur un arbre perché,
Tenait en son bec un fromage.
Maître Renard, par l’odeur alléché,
Lui tint à peu près ce langage :
« Hé ! bonjour, Monsieur du Corbeau.
Que vous êtes joli ! Que vous me semblez beau !
Sans mentir, si votre ramage
Se rapporte à votre plumage,
Vous êtes le Phénix des hôtes de ces bois. »
A ces mots le Corbeau ne se sent pas de joie ;
Et pour montrer sa belle voix,
Il ouvre un large bec, laisse tomber sa proie.
Le Renard s’en saisit, et dit : « Mon bon Monsieur,
Apprenez que tout flatteur
Vit aux dépens de celui qui l’écoute :
Cette leçon vaut bien un fromage, sans doute. »
Le Corbeau, honteux et confus,
Jura, mais un peu tard, qu’on ne l’y prendrait plus.";
        const string Le_Corbeau_et_le_Renard__Broken = @"
|Maître| |Corbeau|,| |sur| |un| |arbre| |perché|,|
|Tenait| |en| |son| |bec| |un| |fromage.|
|Maître| |Renard|,| |par| |l|’|odeur| |alléché|,|
|Lui| |tint| |à| |peu| |près| |ce| |langage| |:|
|«| |Hé| |!| |bonjour|,| |Monsieur| |du| |Corbeau.|
|Que| |vous| |êtes| |joli| |!| |Que| |vous| |me| |semblez| |beau| |!|
|Sans| |mentir|,| |si| |votre| |ramage|
|Se| |rapporte| |à| |votre| |plumage|,|
|Vous| |êtes| |le| |Phénix| |des| |hôtes| |de| |ces| |bois.| |»|
|A| |ces| |mots| |le| |Corbeau| |ne| |se| |sent| |pas| |de| |joie| |;|
|Et| |pour| |montrer| |sa| |belle| |voix|,|
|Il| |ouvre| |un| |large| |bec|,| |laisse| |tomber| |sa| |proie.|
|Le| |Renard| |s|’|en| |saisit|,| |et| |dit| |:| |«| |Mon| |bon| |Monsieur|,|
|Apprenez| |que| |tout| |flatteur|
|Vit| |aux| |dépens| |de| |celui| |qui| |l|’|écoute| |:|
|Cette| |leçon| |vaut| |bien| |un| |fromage|,| |sans| |doute.| |»|
|Le| |Corbeau|,| |honteux| |et| |confus|,|
|Jura|,| |mais| |un| |peu| |tard|,| |qu|’|on| |ne| |l|’|y| |prendrait| |plus.|";

        // Returns `text` with `separator` inserted at every break offset the breaker reports.
        string BreakText(string text, string separator = "|")
        {
            var breaker = new CustomBreaker {
                ThrowIfCharOutOfRange = true
            };
            var reportedBreaks = new List <BreakAtInfo>();

            breaker.SetNewBreakHandler(vis => reportedBreaks.Add(new BreakAtInfo(vis.LatestBreakAt, vis.LatestWordKind)));


#warning Use `breaker.BreakWords(text, breakList);` once #156 is merged

            breaker.BreakWords(text);

            var marked = new StringBuilder(text);

            // Insert from the last break towards the first so that the offsets of the
            // breaks still to be processed are not shifted by earlier insertions.
            for (int k = reportedBreaks.Count - 1; k >= 0; k--)
            {
                marked.Insert(reportedBreaks[k].breakAt, separator);
            }
            return(marked.ToString());
        }

        var brokenString = BreakText(Le_Corbeau_et_le_Renard);
        Assert.AreEqual(Le_Corbeau_et_le_Renard__Broken, brokenString);
    }