/// <summary>
/// Adds to <paramref name="sourceSet"/> the characters this transliterator may consume,
/// restricted by <paramref name="inputFilter"/>, and adds the full code point range to
/// <paramref name="targetSet"/> when any such character survives the filter.
/// </summary>
/// <seealso cref="Transliterator.AddSourceTargetSet(UnicodeSet, UnicodeSet, UnicodeSet)"/>
public override void AddSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet)
{
    UnicodeSet effectiveFilter = GetFilterAsUnicodeSet(inputFilter);

    // The filter has to admit both the prefix and the suffix, otherwise nothing can match.
    if (!(effectiveFilter.ContainsAll(UnicodeNameTransliterator.OPEN_DELIM)
          && effectiveFilter.Contains(CLOSE_DELIM)))
    {
        return;
    }

    UnicodeSet candidates = new UnicodeSet()
        .AddAll('0', '9')
        .AddAll('A', 'F')
        .AddAll('a', 'z')   // for controls
        .Add('<').Add('>')  // for controls
        .Add('(').Add(')')  // for controls
        .Add('-')
        .Add(' ')
        .AddAll(UnicodeNameTransliterator.OPEN_DELIM)
        .Add(CLOSE_DELIM);
    candidates.RetainAll(effectiveFilter);

    if (candidates.Count <= 0)
    {
        return;
    }

    sourceSet.AddAll(candidates);
    // The output could be any character at all.
    targetSet.AddAll(0, 0x10FFFF);
}
/// <summary>
/// Checks whether every item enumerated from this instance is contained in
/// <paramref name="other"/>.
/// </summary>
/// <param name="other">The set to test the enumerated items against.</param>
/// <returns><c>false</c> as soon as an item is found that <paramref name="other"/>
/// does not contain; otherwise <c>true</c>.</returns>
private bool IsSubsetOfInternal(UnicodeSet other)
{
    foreach (var element in this)
    {
        if (other.Contains(element))
        {
            continue;
        }

        // First element missing from 'other' settles the answer.
        return false;
    }

    return true;
}
/// <summary>
/// Update the set of unhandled characters for the specified <paramref name="breakType"/> to include
/// all that have the same script as <paramref name="c"/>.
/// May be called concurrently with <see cref="Handles(int, int)"/> or <see cref="FindBreaks(CharacterIterator, int, int, int, DictionaryBreakEngine.DequeI)"/>.
/// Must not be called concurrently with itself.
/// </summary>
/// <param name="c">The code point whose script should be added; ignored when it equals
/// <c>CharacterIteration.Done32</c>.</param>
/// <param name="breakType">Index into the per-break-type table; out-of-range values are ignored.</param>
public void HandleChar(int c, int breakType)
{
    bool breakTypeInRange = breakType >= 0 && breakType < fHandled.Length;
    if (!breakTypeInRange || c == CharacterIteration.Done32)
    {
        return;
    }

    UnicodeSet existing = fHandled[breakType];
    if (existing.Contains(c))
    {
        // Already covered; nothing to add.
        return;
    }

    // Build a fresh set (script of c plus everything already present) and swap it in,
    // rather than mutating the set that is currently published in the table.
    int scriptValue = UChar.GetIntPropertyValue(c, UProperty.Script);
    UnicodeSet expanded = new UnicodeSet();
    expanded.ApplyInt32PropertyValue(UProperty.Script, scriptValue);
    expanded.AddAll(existing);
    fHandled[breakType] = expanded;
}
/// <summary>
/// Advances <paramref name="text"/> over the run of characters, starting at its current
/// position, that are in the set recorded for <paramref name="breakType"/>, stopping once
/// the iterator index reaches <paramref name="endPos"/>.
/// </summary>
/// <param name="text">Iterator to advance; its position is modified.</param>
/// <param name="startPos">Not used by this implementation.</param>
/// <param name="endPos">Index at which scanning stops.</param>
/// <param name="breakType">Index into the per-break-type table; out-of-range values do nothing.</param>
/// <param name="foundBreaks">Not used by this implementation; no breaks are pushed.</param>
/// <returns>Always 0.</returns>
public int FindBreaks(CharacterIterator text, int startPos, int endPos, int breakType, DictionaryBreakEngine.DequeI foundBreaks)
{
    if (breakType < 0 || breakType >= fHandled.Length)
    {
        return 0;
    }

    UnicodeSet handled = fHandled[breakType];
    for (int ch = CharacterIteration.Current32(text);
         text.Index < endPos && handled.Contains(ch);
         ch = CharacterIteration.Current32(text))
    {
        CharacterIteration.Next32(text);
    }

    return 0;
}
/// <summary>
/// Find the source and target sets, subject to the input filter.
/// There is a known issue with filters containing multiple characters.
/// </summary>
/// <param name="filter">Filter that every position of the key must be able to match.</param>
/// <param name="sourceSet">Receives the characters matched by the key when the rule passes the filter.</param>
/// <param name="targetSet">Receives the replacement set of the rule's output when the rule passes the filter.</param>
/// <param name="revisiting">Not used by this method.</param>
// TODO: Problem: the rule is [{ab}]c > x
// The filter is [a{bc}].
// If the input is abc, then the rule will work.
// However, following code applying the filter won't catch that case.
internal void AddSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet, UnicodeSet revisiting)
{
    int limit = anteContextLength + keyLength;
    UnicodeSet tempSource = new UnicodeSet();
    UnicodeSet temp = new UnicodeSet();

    // We need to walk through the pattern.
    // Iff some of the characters at ALL of the positions are matched by the filter,
    // then we add tempSource to sourceSet.
    for (int i = anteContextLength; i < limit;)
    {
        int ch = UTF16.CharAt(pattern, i);
        i += UTF16.GetCharCount(ch);

        IUnicodeMatcher matcher = data.LookupMatcher(ch);
        if (matcher == null)
        {
            // Plain literal character in the pattern.
            if (!filter.Contains(ch))
            {
                return;
            }
            tempSource.Add(ch);
            continue;
        }

        // Test the runtime type directly instead of casting and catching
        // InvalidCastException: same branch selection, without using an
        // exception for ordinary control flow.
        UnicodeSet matcherSet = matcher as UnicodeSet;
        if (matcherSet != null)
        {
            if (!filter.ContainsSome(matcherSet))
            {
                return;
            }
            matcher.AddMatchSetTo(tempSource);
        }
        else
        {
            // The matcher is not a UnicodeSet: materialize its match set first.
            temp.Clear();
            matcher.AddMatchSetTo(temp);
            if (!filter.ContainsSome(temp))
            {
                return;
            }
            tempSource.AddAll(temp);
        }
    }

    // if we made our way through the gauntlet, add to source/target
    sourceSet.AddAll(tempSource);
    output.AddReplacementSetTo(targetSet);
}
//------------------------------------------------------------------------
//
//  Build       Build the list of non-overlapping character ranges
//              from the Unicode Sets.
//
//------------------------------------------------------------------------
/// <summary>
/// Builds the list of non-overlapping character ranges from the Unicode sets referenced by
/// the rule builder, assigns a group number to each distinct combination of sets, and
/// writes the resulting range-to-group mapping into <c>fTrie</c>.
/// </summary>
/// <remarks>
/// Optional debug output is produced when <c>fRB.fDebugEnv</c> contains
/// "usets", "range", "rgroup" or "esets".
/// </remarks>
internal virtual void Build()
{
    RangeDescriptor rlRange;

    if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("usets", StringComparison.Ordinal) >= 0)
    {
        PrintSets();
    }

    //  Initialize the process by creating a single range encompassing all characters
    //  that is in no sets.
    //
    fRangeList = new RangeDescriptor();
    fRangeList.fStartChar = 0;
    fRangeList.fEndChar = 0x10ffff;

    //
    //  Find the set of non-overlapping ranges of characters
    //
    foreach (RBBINode usetNode in fRB.fUSetNodes)
    {
        UnicodeSet inputSet = usetNode.fInputSet;
        int inputSetRangeCount = inputSet.RangeCount;
        int inputSetRangeIndex = 0;
        // Each input set is walked against the range list from its head.
        rlRange = fRangeList;

        for (; ;)
        {
            if (inputSetRangeIndex >= inputSetRangeCount)
            {
                break;
            }
            int inputSetRangeBegin = inputSet.GetRangeStart(inputSetRangeIndex);
            int inputSetRangeEnd = inputSet.GetRangeEnd(inputSetRangeIndex);

            // skip over ranges from the range list that are completely
            //   below the current range from the input unicode set.
            while (rlRange.fEndChar < inputSetRangeBegin)
            {
                rlRange = rlRange.fNext;
            }

            // If the start of the range from the range list is before
            //   the start of the range from the unicode set, split the range list range
            //   in two, with one part being before (wholly outside of) the unicode set
            //   and the other containing the rest.
            //   Then continue the loop; the post-split current range will then be skipped
            //     over by the while loop above on the next pass.
            if (rlRange.fStartChar < inputSetRangeBegin)
            {
                rlRange.Split(inputSetRangeBegin);
                continue;
            }

            // Same thing at the end of the ranges...
            // If the end of the range from the range list doesn't coincide with
            //   the end of the range from the unicode set, split the range list
            //   range in two.  The first part of the split range will be
            //   wholly inside the Unicode set.
            if (rlRange.fEndChar > inputSetRangeEnd)
            {
                rlRange.Split(inputSetRangeEnd + 1);
            }

            // The current rlRange is now entirely within the UnicodeSet range.
            // Add this unicode set to the list of sets for this rlRange
            // (only once, hence the IndexOf check).
            if (rlRange.fIncludesSets.IndexOf(usetNode) == -1)
            {
                rlRange.fIncludesSets.Add(usetNode);
            }

            // Advance over ranges that we are finished with.
            if (inputSetRangeEnd == rlRange.fEndChar)
            {
                inputSetRangeIndex++;
            }
            rlRange = rlRange.fNext;
        }
    }

    if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("range", StringComparison.Ordinal) >= 0)
    {
        PrintRanges();
    }

    //
    //  Group the above ranges, with each group consisting of one or more
    //    ranges that are in exactly the same set of original UnicodeSets.
    //    The groups are numbered, and these group numbers are the set of
    //    input symbols recognized by the run-time state machine.
    //
    //    Numbering:  # 0  (state table column 0) is unused.
    //                # 1  is reserved - table column 1 is for end-of-input
    //                # 2  is reserved - table column 2 is for beginning-of-input
    //                # 3  is the first range list.
    //
    RangeDescriptor rlSearchRange;
    for (rlRange = fRangeList; rlRange != null; rlRange = rlRange.fNext)
    {
        // Look among the earlier ranges for one with an equal list of sets and reuse its number.
        for (rlSearchRange = fRangeList; rlSearchRange != rlRange; rlSearchRange = rlSearchRange.fNext)
        {
            if (ListEqualityComparer <RBBINode> .Default.Equals(rlRange.fIncludesSets, rlSearchRange.fIncludesSets))
            {
                rlRange.fNum = rlSearchRange.fNum;
                break;
            }
        }
        // fNum still 0 here means no earlier range matched: start a new group.
        if (rlRange.fNum == 0)
        {
            fGroupCount++;
            rlRange.fNum = fGroupCount + 2;
            rlRange.SetDictionaryFlag();
            AddValToSets(rlRange.fIncludesSets, fGroupCount + 2);
        }
    }

    // Handle input sets that contain the special string {eof}.
    //   Column 1 of the state table is reserved for EOF on input.
    //   Column 2 is reserved for before-the-start-input.
    //            (This column can be optimized away later if there are no rule
    //             references to {bof}.)
    //   Add this column value (1 or 2) to the equivalent expression
    //     subtree for each UnicodeSet that contains the string {eof} or {bof}.
    //   Because {bof} and {eof} are not characters in the normal sense,
    //   they don't affect the computation of ranges or TRIE.

    string eofString = "eof";
    string bofString = "bof";

    foreach (RBBINode usetNode in fRB.fUSetNodes)
    {
        UnicodeSet inputSet = usetNode.fInputSet;
        if (inputSet.Contains(eofString))
        {
            AddValToSet(usetNode, 1);
        }
        if (inputSet.Contains(bofString))
        {
            AddValToSet(usetNode, 2);
            fSawBOF = true;
        }
    }

    if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("rgroup", StringComparison.Ordinal) >= 0)
    {
        PrintRangeGroups();
    }
    if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("esets", StringComparison.Ordinal) >= 0)
    {
        PrintSets();
    }

    // Record the group number of every range in the trie.
    fTrie = new Trie2Writable(0,   //   Initial value for all code points.
                              0);  //   Error value for out-of-range input.

    for (rlRange = fRangeList; rlRange != null; rlRange = rlRange.fNext)
    {
        fTrie.SetRange(
                rlRange.fStartChar,     // Range start
                rlRange.fEndChar,       // Range end (inclusive)
                rlRange.fNum,           // value for range
                true                    // Overwrite previously written values
                );
    }
}
/// <summary>
/// Scans the text between <paramref name="rangeStart"/> and <paramref name="rangeEnd"/> for words,
/// using dictionary lookups with a small look-ahead window and a resynchronization heuristic for
/// non-dictionary text, and pushes the end position of each word onto <paramref name="foundBreaks"/>.
/// </summary>
/// <param name="fIter">Iterator over the text; its position is modified.</param>
/// <param name="rangeStart">Index at which scanning starts.</param>
/// <param name="rangeEnd">Index at which scanning stops.</param>
/// <param name="foundBreaks">Receives the break positions that were found.</param>
/// <returns>
/// The number of words found, or 0 when the range is shorter than <c>BURMESE_MIN_WORD</c>.
/// The count is reduced by one when a trailing break at or beyond <paramref name="rangeEnd"/> is removed.
/// </returns>
public override int DivideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd, DequeI foundBreaks)
{
    if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD)
    {
        return(0); // Not enough characters for word
    }
    int wordsFound = 0;
    int wordLength;
    int current;
    // Look-ahead slots, indexed modulo BURMESE_LOOKAHEAD relative to wordsFound.
    PossibleWord[] words = new PossibleWord[BURMESE_LOOKAHEAD];
    for (int i = 0; i < BURMESE_LOOKAHEAD; i++)
    {
        words[i] = new PossibleWord();
    }

    int uc;
    fIter.SetIndex(rangeStart);
    while ((current = fIter.Index) < rangeEnd)
    {
        wordLength = 0;

        // Look for candidate words at the current position
        int candidates = words[wordsFound % BURMESE_LOOKAHEAD].Candidates(fIter, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1)
        {
            wordLength = words[wordsFound % BURMESE_LOOKAHEAD].AcceptMarked(fIter);
            wordsFound += 1;
        }
        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1)
        {
            bool foundBest = false;
            // If we're already at the end of the range, we're done
            if (fIter.Index < rangeEnd)
            {
                do
                {
                    int wordsMatched = 1;
                    if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].Candidates(fIter, fDictionary, rangeEnd) > 0)
                    {
                        // wordsMatched was set to 1 just above, so this test always passes here.
                        if (wordsMatched < 2)
                        {
                            // Followed by another dictionary word; mark first word as a good candidate
                            words[wordsFound % BURMESE_LOOKAHEAD].MarkCurrent();
                            wordsMatched = 2;
                        }

                        // If we're already at the end of the range, we're done
                        if (fIter.Index >= rangeEnd)
                        {
                            break;
                        }

                        // See if any of the possible second words is followed by a third word
                        do
                        {
                            // If we find a third word, stop right away
                            if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].Candidates(fIter, fDictionary, rangeEnd) > 0)
                            {
                                words[wordsFound % BURMESE_LOOKAHEAD].MarkCurrent();
                                foundBest = true;
                                break;
                            }
                        } while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].BackUp(fIter));
                    }
                } while (words[wordsFound % BURMESE_LOOKAHEAD].BackUp(fIter) && !foundBest);
            }
            wordLength = words[wordsFound % BURMESE_LOOKAHEAD].AcceptMarked(fIter);
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if (fIter.Index < rangeEnd && wordLength < BURMESE_ROOT_COMBINE_THRESHOLD)
        {
            // If it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % BURMESE_LOOKAHEAD].Candidates(fIter, fDictionary, rangeEnd) <= 0 &&
                    (wordLength == 0 ||
                            words[wordsFound % BURMESE_LOOKAHEAD].LongestPrefix < BURMESE_PREFIX_COMBINE_THRESHOLD))
            {
                // Look for a plausible word boundary
                int remaining = rangeEnd - (current + wordLength);
                int pc = fIter.Current;
                int chars = 0;
                for (; ;)
                {
                    fIter.Next();
                    uc = fIter.Current;
                    chars += 1;
                    if (--remaining <= 0)
                    {
                        break;
                    }
                    if (fEndWordSet.Contains(pc) && fBeginWordSet.Contains(uc))
                    {
                        // Maybe. See if it's in the dictionary.
                        int candidate = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].Candidates(fIter, fDictionary, rangeEnd);
                        // Put the iterator back at the position that was just probed.
                        fIter.SetIndex(current + wordLength + chars);
                        if (candidate > 0)
                        {
                            break;
                        }
                    }
                    pc = uc;
                }

                // Bump the word count if there wasn't already one
                if (wordLength <= 0)
                {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                wordLength += chars;
            }
            else
            {
                // Backup to where we were for next iteration
                fIter.SetIndex(current + wordLength);
            }
        }

        // Never stop before a combining mark.
        int currPos;
        while ((currPos = fIter.Index) < rangeEnd && fMarkSet.Contains(fIter.Current))
        {
            fIter.Next();
            wordLength += fIter.Index - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
        // NOT CURRENTLY APPLICABLE TO BURMESE

        // Did we find a word on this iteration? If so, push it on the break stack
        if (wordLength > 0)
        {
            foundBreaks.Push(current + wordLength);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there
    if (foundBreaks.Peek() >= rangeEnd)
    {
        foundBreaks.Pop();
        wordsFound -= 1;
    }

    return(wordsFound);
}
/// <summary>
/// Reports whether this engine applies to the character <paramref name="c"/> for the given
/// <paramref name="breakType"/>.
/// </summary>
/// <param name="c">The character to test against this engine's character set.</param>
/// <param name="breakType">The break type to test against this engine's supported types.</param>
/// <returns><c>true</c> only when the break type is supported and the character is recognized.</returns>
public virtual bool Handles(int c, int breakType)
{
    if (!fTypes.SafeGet(breakType))
    {
        // This break type cannot use us; the character is not consulted.
        return false;
    }

    // The type matches, so the answer is whether we recognize the character.
    return fSet.Contains(c);
}
/// <summary>
/// Implements <see cref="Transliterator.HandleTransliterate(IReplaceable, Position, bool)"/>.
/// Scans <paramref name="text"/> between <c>offsets.Start</c> and <c>offsets.Limit</c> for an open
/// delimiter pattern followed by a character name and a close delimiter, and replaces each
/// such sequence whose name lookup succeeds with the corresponding character.
/// </summary>
/// <param name="text">The text to modify in place.</param>
/// <param name="offsets">Supplies the start and limit of the range to process; on return its
/// <c>Start</c>, <c>Limit</c> and <c>ContextLimit</c> are updated for any change in text length.</param>
/// <param name="isIncremental">When <c>true</c> and an open delimiter candidate is still pending,
/// <c>offsets.Start</c> is left at that candidate instead of at the cursor.</param>
protected override void HandleTransliterate(IReplaceable text, Position offsets, bool isIncremental)
{
    int maxLen = UCharacterName.Instance.MaxCharNameLength + 1; // allow for temporary trailing space

    StringBuffer name = new StringBuffer(maxLen);

    // Get the legal character set
    UnicodeSet legal = new UnicodeSet();
    UCharacterName.Instance.GetCharNameCharacters(legal);

    int cursor = offsets.Start;
    int limit = offsets.Limit;

    // Modes:
    // 0 - looking for open delimiter
    // 1 - after open delimiter
    int mode = 0;
    int openPos = -1; // open delim candidate pos

    int c;
    while (cursor < limit)
    {
        c = text.Char32At(cursor);

        switch (mode)
        {
            case 0: // looking for open delimiter
                if (c == OPEN_DELIM)
                { // quick check first
                    openPos = cursor;
                    int i = Utility.ParsePattern(OPEN_PAT, text, cursor, limit);
                    if (i >= 0 && i < limit)
                    {
                        mode = 1;
                        name.Length = 0; // start collecting a new name
                        cursor = i;
                        continue; // *** reprocess char32At(cursor)
                    }
                }
                break;

            case 1: // after open delimiter
                // Look for legal chars.  If \s+ is found, convert it
                // to a single space.  If closeDelimiter is found, exit
                // the loop.  If any other character is found, exit the
                // loop.  If the limit is reached, exit the loop.

                // Convert \s+ => SPACE.  This assumes there are no
                // runs of >1 space characters in names.
                if (PatternProps.IsWhiteSpace(c))
                {
                    // Ignore leading whitespace
                    if (name.Length > 0 && name[name.Length - 1] != SPACE)
                    {
                        name.Append(SPACE);
                        // If we are too long then abort.  maxLen includes
                        // temporary trailing space, so use '>'.
                        if (name.Length > maxLen)
                        {
                            mode = 0;
                        }
                    }
                    break;
                }

                if (c == CLOSE_DELIM)
                {
                    int len = name.Length;

                    // Delete trailing space, if any
                    if (len > 0 && name[len - 1] == SPACE)
                    {
                        name.Length = --len;
                    }

                    // c is reused here to hold the looked-up character (-1 when the lookup fails).
                    c = UCharacter.GetCharFromExtendedName(name.ToString());
                    if (c != -1)
                    { // Lookup succeeded
                        // assert(UTF16.getCharCount(CLOSE_DELIM) == 1);
                        cursor++; // advance over CLOSE_DELIM

                        string str = UTF16.ValueOf(c);
                        text.Replace(openPos, cursor, str);

                        // Adjust indices for the change in the length of
                        // the string.  Do not assume that str.length() ==
                        // 1, in case of surrogates.
                        int delta = cursor - openPos - str.Length;
                        cursor -= delta;
                        limit -= delta;
                        // assert(cursor == openPos + str.length());
                    }
                    // If the lookup failed, we leave things as-is and
                    // still switch to mode 0 and continue.
                    mode = 0;
                    openPos = -1; // close off candidate
                    continue; // *** reprocess char32At(cursor)
                }

                if (legal.Contains(c))
                {
                    UTF16.Append(name, c);
                    // If we go past the longest possible name then abort.
                    // maxLen includes temporary trailing space, so use '>='.
                    if (name.Length >= maxLen)
                    {
                        mode = 0;
                    }
                }
                // Invalid character
                else
                {
                    // NOTE(review): the decrement is followed by the shared
                    // "cursor += GetCharCount(c)" below, so the net movement is
                    // GetCharCount(c) - 1 — confirm this is intended when c is
                    // outside the BMP.
                    --cursor; // Backup and reprocess this character
                    mode = 0;
                }

                break; // e.g. on a whitespace or legal character
        }

        cursor += UTF16.GetCharCount(c);
    }

    // Propagate any length change caused by replacements back to the caller's offsets.
    offsets.ContextLimit += limit - offsets.Limit;
    offsets.Limit = limit;
    // In incremental mode, only advance the cursor up to the last
    // open delimiter candidate.
    offsets.Start = (isIncremental && openPos >= 0) ? openPos : cursor;
}
/// <summary>
/// Segments the text between <paramref name="startPos"/> and <paramref name="endPos"/> into words.
/// The text is NFKC-normalized when it is not already normalized, a dynamic-programming pass
/// picks the lowest-cost segmentation from dictionary matches plus a Katakana-run heuristic, and
/// the resulting boundaries are pushed onto <paramref name="foundBreaks"/>.
/// </summary>
/// <param name="inText">Iterator over the original text; its position is modified.</param>
/// <param name="startPos">Index of the first character of the range.</param>
/// <param name="endPos">Index just past the last character of the range.</param>
/// <param name="foundBreaks">Receives the break positions, expressed as indices into the original text.</param>
/// <returns>
/// The number of breaks pushed, less one when a trailing break equal to <paramref name="endPos"/>
/// is popped again; 0 when <paramref name="startPos"/> is not less than <paramref name="endPos"/>.
/// </returns>
public override int DivideUpDictionaryRange(CharacterIterator inText, int startPos, int endPos, DequeI foundBreaks)
{
    if (startPos >= endPos)
    {
        return(0);
    }

    inText.SetIndex(startPos);

    int inputLength = endPos - startPos;
    // charPositions[k] is the offset, relative to startPos, of the k-th boundary
    // candidate in the original text.
    int[] charPositions = new int[inputLength + 1];
    StringBuffer s = new StringBuffer("");
    inText.SetIndex(startPos);
    // Copy the range into a string so it can be checked and normalized.
    while (inText.Index < endPos)
    {
        s.Append(inText.Current);
        inText.Next();
    }
    string prenormstr = s.ToString();
#pragma warning disable 612, 618
    bool isNormalized = Normalizer.QuickCheck(prenormstr, NormalizerMode.NFKC) == QuickCheckResult.Yes ||
                           Normalizer.IsNormalized(prenormstr, NormalizerMode.NFKC, 0);
#pragma warning restore 612, 618
    CharacterIterator text;
    int numChars = 0;
    if (isNormalized)
    {
        // Already normalized: step through the string one code point at a time.
        text = new StringCharacterIterator(prenormstr);
        int index = 0;
        charPositions[0] = 0;
        while (index < prenormstr.Length)
        {
            int codepoint = prenormstr.CodePointAt(index);
            index += Character.CharCount(codepoint);
            numChars++;
            charPositions[numChars] = index;
        }
    }
    else
    {
#pragma warning disable 612, 618
        // Not normalized: segment the normalized string, and record the index the
        // normalizer reports after each step.
        string normStr = Normalizer.Normalize(prenormstr, NormalizerMode.NFKC);
        text = new StringCharacterIterator(normStr);
        charPositions = new int[normStr.Length + 1];
        Normalizer normalizer = new Normalizer(prenormstr, NormalizerMode.NFKC, 0);
        int index = 0;
        charPositions[0] = 0;
        while (index < normalizer.EndIndex)
        {
            normalizer.Next();
            numChars++;
            index = normalizer.Index;
            charPositions[numChars] = index;
        }
#pragma warning restore 612, 618
    }

    // From here on out, do the algorithm. Note that our indices
    // refer to indices within the normalized string.

    // bestSnlp[i] is the lowest cost found so far for segmenting the first i characters;
    // kint32max marks positions that have not been reached.
    int[] bestSnlp = new int[numChars + 1];
    bestSnlp[0] = 0;
    for (int i = 1; i <= numChars; i++)
    {
        bestSnlp[i] = kint32max;
    }

    // prev[i] is the start of the word that ends at i in the best segmentation; -1 when unset.
    int[] prev = new int[numChars + 1];
    for (int i = 0; i <= numChars; i++)
    {
        prev[i] = -1;
    }

    int maxWordSize = 20;
    int[] values = new int[numChars];
    int[] lengths = new int[numChars];
    // dynamic programming to find the best segmentation
    bool is_prev_katakana = false;
    for (int i = 0; i < numChars; i++)
    {
        text.SetIndex(i);
        // Skip positions that no segmentation has reached.
        if (bestSnlp[i] == kint32max)
        {
            continue;
        }

        int maxSearchLength = (i + maxWordSize < numChars) ? maxWordSize : (numChars - i);
        // Single-element array used as an out-parameter for the match count.
        int[] count_ = new int[1];
        fDictionary.Matches(text, maxSearchLength, lengths, count_, maxSearchLength, values);
        int count = count_[0];

        // if there are no single character matches found in the dictionary
        // starting with this character, treat character as a 1-character word
        // with the highest value possible (i.e. the least likely to occur).
        // Exclude Korean characters from this treatment, as they should be
        // left together by default.
        text.SetIndex(i);  // fDictionary.matches() advances the text position; undo that.
        if ((count == 0 || lengths[0] != 1) && CharacterIteration.Current32(text) != CharacterIteration.Done32 && !fHangulWordSet.Contains(CharacterIteration.Current32(text)))
        {
            values[count] = maxSnlp;
            lengths[count] = 1;
            count++;
        }

        // Relax every candidate word starting at i.
        for (int j = 0; j < count; j++)
        {
            int newSnlp = bestSnlp[i] + values[j];
            if (newSnlp < bestSnlp[lengths[j] + i])
            {
                bestSnlp[lengths[j] + i] = newSnlp;
                prev[lengths[j] + i] = i;
            }
        }

        // In Japanese, single-character Katakana words are pretty rare.
        // So we apply the following heuristic to Katakana: any continuous
        // run of Katakana characters is considered a candidate word with
        // a default cost specified in the katakanaCost table according
        // to its length.
        bool is_katakana = IsKatakana(CharacterIteration.Current32(text));
        // Only the first character of a Katakana run starts a candidate.
        if (!is_prev_katakana && is_katakana)
        {
            int j = i + 1;
            CharacterIteration.Next32(text);
            // Find the end of the continuous run of Katakana characters
            while (j < numChars && (j - i) < kMaxKatakanaGroupLength && IsKatakana(CharacterIteration.Current32(text)))
            {
                CharacterIteration.Next32(text);
                ++j;
            }

            if ((j - i) < kMaxKatakanaGroupLength)
            {
                int newSnlp = bestSnlp[i] + GetKatakanaCost(j - i);
                if (newSnlp < bestSnlp[j])
                {
                    bestSnlp[j] = newSnlp;
                    prev[j] = i;
                }
            }
        }
        is_prev_katakana = is_katakana;
    }

    // Collect the boundaries of the best segmentation, from the end back towards the start.
    int[] t_boundary = new int[numChars + 1];
    int numBreaks = 0;
    if (bestSnlp[numChars] == kint32max)
    {
        // The end was never reached: treat the whole range as a single word.
        t_boundary[numBreaks] = numChars;
        numBreaks++;
    }
    else
    {
        for (int i = numChars; i > 0; i = prev[i])
        {
            t_boundary[numBreaks] = i;
            numBreaks++;
        }
        Assert.Assrt(prev[t_boundary[numBreaks - 1]] == 0);
    }

    // Add the start of the range as a boundary candidate when the break stack
    // is empty or its top lies before startPos.
    if (foundBreaks.Count == 0 || foundBreaks.Peek() < startPos)
    {
        t_boundary[numBreaks++] = 0;
    }

    // Push boundaries in ascending order, mapped back to original-text indices,
    // skipping startPos itself and positions that are already present.
    int correctedNumBreaks = 0;
    for (int i = numBreaks - 1; i >= 0; i--)
    {
        int pos = charPositions[t_boundary[i]] + startPos;
        if (!(foundBreaks.Contains(pos) || pos == startPos))
        {
            foundBreaks.Push(charPositions[t_boundary[i]] + startPos);
            correctedNumBreaks++;
        }
    }

    // Do not report a break at the very end of the range.
    if (!foundBreaks.IsEmpty && foundBreaks.Peek() == endPos)
    {
        foundBreaks.Pop();
        correctedNumBreaks--;
    }
    // Leave the caller's iterator at the last break that was kept.
    if (!foundBreaks.IsEmpty)
    {
        inText.SetIndex(foundBreaks.Peek());
    }
    return(correctedNumBreaks);
}