Esempio n. 1
0
        /// <summary>
        /// Implements <see cref="Transliterator.HandleTransliterate(IReplaceable, Position, bool)"/>.
        /// </summary>
        /// <remarks>
        /// Walks the range <c>[pos.Start, pos.Limit)</c> of <paramref name="text"/>. At each
        /// position every form listed in <c>spec</c> is tried in order. A form is stored as a
        /// five-value header (prefix length, suffix length, radix, minimum digit count,
        /// maximum digit count) followed by the prefix characters and then the suffix
        /// characters; the list ends with <c>END</c>. When prefix, digits and suffix all
        /// match, the matched span is replaced with <c>UTF16.ValueOf</c> of the parsed number.
        /// On return, <paramref name="pos"/> is updated with the new start, limit and
        /// context limit.
        /// </remarks>
        /// <param name="text">The text to modify in place.</param>
        /// <param name="pos">The range to process; updated before returning.</param>
        /// <param name="isIncremental">
        /// When <c>true</c>, processing stops as soon as the input ends in the middle of a
        /// possible match, leaving <c>pos.Start</c> at the beginning of that partial match.
        /// </param>
        protected override void HandleTransliterate(IReplaceable text,
                                                    Position pos, bool isIncremental)
        {
            int start = pos.Start;
            int limit = pos.Limit;

            while (start < limit)
            {
                // Try each form in spec[] at the current position; stop at the first
                // one that matches completely.
                int specIndex = 0;
                while (spec[specIndex] != END)
                {
                    // Header of the current form.
                    int prefixLen = spec[specIndex++];
                    int suffixLen = spec[specIndex++];
                    int radix     = spec[specIndex++];
                    int minDigits = spec[specIndex++];
                    int maxDigits = spec[specIndex++];

                    // Remember where this form's literal characters begin and step the
                    // index to the next form right away, so that a failed match can
                    // simply 'continue'.
                    int literalBase = specIndex;
                    specIndex += prefixLen + suffixLen;

                    // scan runs ahead of start over the characters being parsed.
                    int  scan    = start;
                    bool matched = true;

                    // --- prefix ---
                    for (int k = 0; k < prefixLen; ++k)
                    {
                        if (scan >= limit && k > 0)
                        {
                            // Input ran out after at least one prefix character
                            // matched: a partial match.  Incremental mode stops
                            // here; otherwise this form fails.
                            if (isIncremental)
                            {
                                goto done;
                            }
                            matched = false;
                            break;
                        }
                        if (text[scan++] != spec[literalBase + k])
                        {
                            matched = false;
                            break;
                        }
                    }
                    if (!matched)
                    {
                        continue;
                    }

                    // --- digits ---
                    int value      = 0;
                    int digitCount = 0;
                    while (true)
                    {
                        if (scan >= limit)
                        {
                            // Partial match check for incremental mode.
                            if (scan > start && isIncremental)
                            {
                                goto done;
                            }
                            break;
                        }
                        int cp    = text.Char32At(scan);
                        int digit = UCharacter.Digit(cp, radix);
                        if (digit < 0)
                        {
                            break;
                        }
                        scan += UTF16.GetCharCount(cp);
                        value = (value * radix) + digit;
                        if (++digitCount == maxDigits)
                        {
                            break;
                        }
                    }
                    if (digitCount < minDigits)
                    {
                        continue;
                    }

                    // --- suffix ---
                    for (int k = 0; k < suffixLen; ++k)
                    {
                        if (scan >= limit)
                        {
                            // Partial match check for incremental mode.
                            if (scan > start && isIncremental)
                            {
                                goto done;
                            }
                            matched = false;
                            break;
                        }
                        if (text[scan++] != spec[literalBase + prefixLen + k])
                        {
                            matched = false;
                            break;
                        }
                    }
                    if (!matched)
                    {
                        continue;
                    }

                    // Complete match: substitute the parsed value and shrink (or grow)
                    // limit by the change in length.  Leaving the form loop here means
                    // the next input character is examined next.
                    string replacement = UTF16.ValueOf(value);
                    text.Replace(start, scan, replacement);
                    limit -= scan - start - replacement.Length;
                    break;
                }

                if (start < limit)
                {
                    start += UTF16.GetCharCount(text.Char32At(start));
                }
            }

        done:
            pos.ContextLimit += limit - pos.Limit;
            pos.Limit         = limit;
            pos.Start         = start;
        }
Esempio n. 2
0
        /// <summary>
        /// Data-driven break iterator test. Loads the embedded resource
        /// <c>ICU4N.Dev.Test.Rbbi.rbbitst.txt</c>, parses it with a small state machine
        /// (comment / tag / data / numeric tag / rules), and calls <c>executeTest</c>
        /// every time a <c>&lt;data&gt; ... &lt;/data&gt;</c> block is closed.
        /// </summary>
        /// <remarks>
        /// Problems in the test file are reported through <c>Errln</c>. A missing tag or a
        /// malformed numeric tag aborts parsing; an unterminated <c>&lt;rules&gt;</c> or
        /// <c>&lt;data&gt;</c> block is reported once the end of the file is reached.
        /// </remarks>
        public void TestExtended()
        {
            TestParams tp = new TestParams();


            //
            //  Open and read the test data file.
            //
            StringBuilder testFileBuf = new StringBuilder();
            Stream        @is         = null;

            try
            {
                @is = typeof(RBBITestExtended).GetTypeInfo().Assembly.GetManifestResourceStream("ICU4N.Dev.Test.Rbbi.rbbitst.txt");
                if (@is == null)
                {
                    Errln("Could not open test data file rbbitst.txt");
                    return;
                }
                using (StreamReader isr = new StreamReader(@is, Encoding.UTF8))
                {
                    int c;
                    int count = 0;
                    for (; ;)
                    {
                        c = isr.Read();
                        if (c < 0)
                        {
                            break;
                        }
                        count++;
                        if (c == 0xFEFF && count == 1)
                        {
                            // BOM in the test data file. Discard it.
                            continue;
                        }

                        testFileBuf.AppendCodePoint(c);
                    }
                }
            }
            catch (IOException e)
            {
                Errln(e.ToString());
                // The exception may have been thrown while opening the resource, in
                // which case there is no stream to release.
                if (@is != null)
                {
                    try
                    {
                        @is.Dispose();
                    }
                    catch (IOException)
                    {
                        // Best-effort cleanup; the original failure was already reported.
                    }
                }
                return;
            }

            String testString = testFileBuf.ToString();


            const int PARSE_COMMENT = 1;
            const int PARSE_TAG     = 2;
            const int PARSE_DATA    = 3;
            const int PARSE_NUM     = 4;
            const int PARSE_RULES   = 5;

            int parseState = PARSE_TAG;

            int savedState = PARSE_TAG;

            int lineNum  = 1;
            int colStart = 0;
            int column   = 0;
            int charIdx  = 0;
            int i;

            int tagValue = 0;                                   // The numeric value of a <nnn> tag.

            StringBuilder rules          = new StringBuilder(); // Holds rules from a <rules> ... </rules> block
            int           rulesFirstLine = 0;                   // Line number of the start of current <rules> block

            int len = testString.Length;

            for (charIdx = 0; charIdx < len;)
            {
                // NOTE(review): charIdx advances by one UTF-16 unit even when CodePointAt
                // returns a supplementary code point, so a trailing surrogate would be
                // visited again on the next iteration - confirm whether that is intended.
                int c = testString.CodePointAt(charIdx);
                charIdx++;
                if (c == '\r' && charIdx < len && testString[charIdx] == '\n')
                {
                    // treat CRLF as a unit
                    c = '\n';
                    charIdx++;
                }
                if (c == '\n' || c == '\r')
                {
                    lineNum++;
                    colStart = charIdx;
                }
                column = charIdx - colStart + 1;

                // Throughout the switch, charIdx already points one past the current
                // character, so charIdx - 1 is where the current character (and any tag
                // beginning with it) starts.
                switch (parseState)
                {
                case PARSE_COMMENT:
                    // A comment runs to the end of the line; then resume the previous state.
                    if (c == 0x0a || c == 0x0d)
                    {
                        parseState = savedState;
                    }
                    break;

                case PARSE_TAG:
                {
                    if (c == '#')
                    {
                        parseState = PARSE_COMMENT;
                        savedState = PARSE_TAG;
                        break;
                    }
                    if (UCharacter.IsWhitespace(c))
                    {
                        break;
                    }
                    if (testString.StartsWith("<word>", charIdx - 1))
                    {
                        tp.bi    = BreakIterator.GetWordInstance(tp.currentLocale);
                        charIdx += 5;
                        break;
                    }
                    if (testString.StartsWith("<char>", charIdx - 1))
                    {
                        tp.bi    = BreakIterator.GetCharacterInstance(tp.currentLocale);
                        charIdx += 5;
                        break;
                    }
                    if (testString.StartsWith("<line>", charIdx - 1))
                    {
                        tp.bi    = BreakIterator.GetLineInstance(tp.currentLocale);
                        charIdx += 5;
                        break;
                    }
                    if (testString.StartsWith("<sent>", charIdx - 1))
                    {
                        tp.bi    = BreakIterator.GetSentenceInstance(tp.currentLocale);
                        charIdx += 5;
                        break;
                    }
                    if (testString.StartsWith("<title>", charIdx - 1))
                    {
                        tp.bi    = BreakIterator.GetTitleInstance(tp.currentLocale);
                        charIdx += 6;
                        break;
                    }
                    if (testString.StartsWith("<rules>", charIdx - 1) ||
                        testString.StartsWith("<badrules>", charIdx - 1))
                    {
                        // Skip to just past the '>' that closes the opening tag.
                        charIdx        = testString.IndexOf('>', charIdx) + 1;
                        parseState     = PARSE_RULES;
                        rules.Length   = (0);
                        rulesFirstLine = lineNum;
                        break;
                    }

                    if (testString.StartsWith("<locale ", charIdx - 1))
                    {
                        int closeIndex = testString.IndexOf(">", charIdx);
                        if (closeIndex < 0)
                        {
                            Errln("line" + lineNum + ": missing close on <locale  tag.");
                            break;
                        }
                        String localeName = testString.Substring(charIdx + 6, closeIndex - (charIdx + 6));         // ICU4N: Corrected 2nd parameter
                        localeName       = localeName.Trim();
                        tp.currentLocale = new ULocale(localeName);
                        charIdx          = closeIndex + 1;
                        break;
                    }
                    if (testString.StartsWith("<data>", charIdx - 1))
                    {
                        // Start a fresh data block: clear the text and all per-offset tables.
                        parseState            = PARSE_DATA;
                        charIdx              += 5;
                        tp.dataToBreak.Length = (0);
                        Arrays.Fill(tp.expectedBreaks, 0);
                        Arrays.Fill(tp.srcCol, 0);
                        Arrays.Fill(tp.srcLine, 0);
                        break;
                    }

                    Errln("line" + lineNum + ": Tag expected in test file.");
                    return;
                    //parseState = PARSE_COMMENT;
                    //savedState = PARSE_DATA;
                }

                case PARSE_RULES:
                    if (testString.StartsWith("</rules>", charIdx - 1))
                    {
                        charIdx   += 7;
                        parseState = PARSE_TAG;
                        try
                        {
                            tp.bi = new RuleBasedBreakIterator(rules.ToString());
                        }
                        catch (ArgumentException e)
                        {
                            Errln(String.Format("rbbitst.txt:{0}  Error creating break iterator from rules.  {1}", lineNum, e));
                        }
                    }
                    else if (testString.StartsWith("</badrules>", charIdx - 1))
                    {
                        // A <badrules> block is expected to be rejected by the rule builder.
                        charIdx   += 10;
                        parseState = PARSE_TAG;
                        bool goodRules = true;
                        try
                        {
                            new RuleBasedBreakIterator(rules.ToString());
                        }
                        catch (ArgumentException)
                        {
                            goodRules = false;
                        }
                        if (goodRules)
                        {
                            Errln(String.Format(
                                      "rbbitst.txt:{0}  Expected, but did not get, a failure creating break iterator from rules.",
                                      lineNum));
                        }
                    }
                    else
                    {
                        rules.AppendCodePoint(c);
                    }
                    break;

                case PARSE_DATA:
                    if (c == '•')
                    {
                        // Expected break (with no specific tag value) before the next data char.
                        int breakIdx = tp.dataToBreak.Length;
                        tp.expectedBreaks[breakIdx] = -1;
                        tp.srcLine[breakIdx]        = lineNum;
                        tp.srcCol[breakIdx]         = column;
                        break;
                    }

                    if (testString.StartsWith("</data>", charIdx - 1))
                    {
                        // Add final entry to mappings from break location to source file position.
                        //  Need one extra because last break position returned is after the
                        //    last char in the data, not at the last char.
                        int idx = tp.dataToBreak.Length;
                        tp.srcLine[idx] = lineNum;
                        tp.srcCol[idx]  = column;

                        parseState = PARSE_TAG;
                        charIdx   += 6;

                        // RUN THE TEST!
                        executeTest(tp);
                        break;
                    }

                    if (testString.StartsWith("\\N{", charIdx - 1))
                    {
                        // Named character, e.g. \N{COMBINING GRAVE ACCENT}
                        // Get the code point from the name and insert it into the test data.
                        int nameEndIdx = testString.IndexOf('}', charIdx);
                        c = -1;
                        if (nameEndIdx != -1)
                        {
                            // Only look the name up when the closing '}' exists; otherwise
                            // the length passed to Substring would be negative.
                            String charName = testString.Substring(charIdx + 2, nameEndIdx - (charIdx + 2));     // ICU4N: Corrected 2nd parameter
                            c = UCharacter.GetCharFromName(charName);
                        }
                        if (c == -1)
                        {
                            // Either the closing '}' is missing or the name was not recognized.
                            Errln("Error in named character in test file at line " + lineNum +
                                  ", col " + column);
                        }
                        else
                        {
                            // Named code point was recognized.  Insert it
                            //   into the test data.
                            tp.dataToBreak.AppendCodePoint(c);
                            for (i = tp.dataToBreak.Length - 1; i >= 0 && tp.srcLine[i] == 0; i--)
                            {
                                tp.srcLine[i] = lineNum;
                                tp.srcCol[i]  = column;
                            }
                        }
                        if (nameEndIdx > charIdx)
                        {
                            charIdx = nameEndIdx + 1;
                        }
                        break;
                    }

                    if (testString.StartsWith("<>", charIdx - 1))
                    {
                        charIdx++;
                        int breakIdx = tp.dataToBreak.Length;
                        tp.expectedBreaks[breakIdx] = -1;
                        tp.srcLine[breakIdx]        = lineNum;
                        tp.srcCol[breakIdx]         = column;
                        break;
                    }

                    if (c == '<')
                    {
                        tagValue   = 0;
                        parseState = PARSE_NUM;
                        break;
                    }

                    if (c == '#' && column == 3)
                    {       // TODO:  why is column off so far?
                        parseState = PARSE_COMMENT;
                        savedState = PARSE_DATA;
                        break;
                    }

                    if (c == '\\')
                    {
                        if (charIdx >= len)
                        {
                            // The backslash is the last character of the file, so there is
                            // nothing to escape.  Leave the switch; the loop condition then
                            // ends parsing and the end-of-file check below reports the
                            // unterminated <data> block.
                            break;
                        }

                        // Check for \ at end of line, a line continuation.
                        //     Advance over (discard) the newline
                        int cp = testString.CodePointAt(charIdx);
                        if (cp == '\r' && charIdx + 1 < len && testString.CodePointAt(charIdx + 1) == '\n')
                        {
                            // We have a CR LF
                            //  Need an extra increment of the input ptr to move over both of them
                            charIdx++;
                        }
                        if (cp == '\n' || cp == '\r')
                        {
                            lineNum++;
                            column = 0;
                            charIdx++;
                            colStart = charIdx;
                            break;
                        }

                        // Let unescape handle the back slash.
                        int[] charIdxAr = new int[1];
                        charIdxAr[0] = charIdx;
                        cp           = Utility.UnescapeAt(testString, charIdxAr);
                        if (cp != -1)
                        {
                            // Escape sequence was recognized.  Insert the char
                            //   into the test data.
                            charIdx = charIdxAr[0];
                            tp.dataToBreak.AppendCodePoint(cp);
                            for (i = tp.dataToBreak.Length - 1; i >= 0 && tp.srcLine[i] == 0; i--)
                            {
                                tp.srcLine[i] = lineNum;
                                tp.srcCol[i]  = column;
                            }

                            break;
                        }


                        // Not a recognized backslash escape sequence.
                        // Take the next char as a literal.
                        //  TODO:  Should this be an error?
                        c       = testString.CodePointAt(charIdx);
                        charIdx = testString.OffsetByCodePoints(charIdx, 1);
                    }

                    // Normal, non-escaped data char.
                    tp.dataToBreak.AppendCodePoint(c);

                    // Save the mapping from offset in the data to line/column numbers in
                    //   the original input file.  Will be used for better error messages only.
                    //   If there's an expected break before this char, the slot in the mapping
                    //     vector will already be set for this char; don't overwrite it.
                    for (i = tp.dataToBreak.Length - 1; i >= 0 && tp.srcLine[i] == 0; i--)
                    {
                        tp.srcLine[i] = lineNum;
                        tp.srcCol[i]  = column;
                    }
                    break;


                case PARSE_NUM:
                    // We are parsing an expected numeric tag value, like <1234>,
                    //   within a chunk of data.
                    if (UCharacter.IsWhitespace(c))
                    {
                        break;
                    }

                    if (c == '>')
                    {
                        // Finished the number.  Add the info to the expected break data,
                        //   and switch parse state back to doing plain data.
                        parseState = PARSE_DATA;
                        if (tagValue == 0)
                        {
                            // Zero is stored as -1, the same value the untagged break
                            // markers above record.
                            tagValue = -1;
                        }
                        int breakIdx = tp.dataToBreak.Length;
                        tp.expectedBreaks[breakIdx] = tagValue;
                        tp.srcLine[breakIdx]        = lineNum;
                        tp.srcCol[breakIdx]         = column;
                        break;
                    }

                    if (UCharacter.IsDigit(c))
                    {
                        tagValue = tagValue * 10 + UCharacter.Digit(c);
                        break;
                    }

                    Errln(String.Format("Syntax Error in rbbitst.txt at line {0}, col {1}", lineNum, column));
                    return;
                }
            }

            // Reached end of test file. Raise an error if parseState indicates that we are
            //   within a block that should have been terminated.
            if (parseState == PARSE_RULES)
            {
                Errln(String.Format("rbbitst.txt:{0} <rules> block beginning at line {1} is not closed.",
                                    lineNum, rulesFirstLine));
            }
            if (parseState == PARSE_DATA)
            {
                Errln(String.Format("rbbitst.txt:{0} <data> block not closed.", lineNum));
            }
        }