public static int IndexOfAny(this ReadOnlySpan<char> @this, CharacterSet chars)
{
    for (var searchLocation = 0; searchLocation < @this.Length; searchLocation++)
    {
        if (chars.Contains(@this[searchLocation]))
        {
            return searchLocation;
        }
    }

    return -1;
}
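A minimal usage sketch for the extension above. It assumes the IndexOfAny extension is in scope and that CharacterSet can be built from a string of characters; that constructor is an assumption for illustration, not a confirmed API.

using System;

// Assumes the IndexOfAny extension above is in scope and that CharacterSet
// has a constructor taking a string of characters (hypothetical).
var vowels = new CharacterSet("aeiou");
int first = "rhythm or melody".AsSpan().IndexOfAny(vowels);   // 7: the 'o' in "or"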
public override Core.Tokenization.Token Recognize(string s, int from, bool allowTokenBundles, ref int consumedLength)
{
    if (String.IsNullOrEmpty(s) || from >= s.Length)
    {
        return null;
    }

    consumedLength = 0;
    int originalStart = from;

    // Consume a run of punctuation characters as a single token.
    if (_DefaultPunctCharset.Contains(s[from]))
    {
        while (from < s.Length && _DefaultPunctCharset.Contains(s[from]))
        {
            ++consumedLength;
            ++from;
        }

        Token t = new SimpleToken(s.Substring(originalStart, consumedLength), TokenType.GeneralPunctuation);
        return t;
    }

    // Try the default word regex, anchored at the current position.
    System.Text.RegularExpressions.Match m = _DefaultWordRegex.Match(s, from);
    if (m != null && m.Success && m.Index == from)
    {
        consumedLength = m.Length;
        Token t = new SimpleToken(m.Value, TokenType.Word);
        return t;
    }

    /*
     * AUTOMATON PUNCT [U+3000-U+303FU+3200-U+32FFU+FF01-U+FF0FU+FF1A-U+FF20U+FF3B-U+FF3DU+FF5B-U+FF64]
     * NFA WORD [U+30A0-U+30FFU+FF65-U+FF9F]+
     * NFA WORD [U+3040-U+3091U+3093-U+309F]+
     * NFA WORD [U+3092]
     * NFA WORD [U+4E00-U+9FFF]+
     * NFA WORD [U+FF21-U+FF3AU+FF41-U+FF5A]+
     */

    return base.Recognize(s, from, allowTokenBundles, ref consumedLength);
}
public override Core.Tokenization.Token Recognize(string s, int from, bool allowTokenBundles, ref int consumedLength)
{
    if (String.IsNullOrEmpty(s) || from >= s.Length)
    {
        return null;
    }

    consumedLength = 0;
    int originalStart = from;

    // Consume a run of punctuation characters as a single token.
    if (_DefaultPunctCharset.Contains(s[from]))
    {
        while (from < s.Length && _DefaultPunctCharset.Contains(s[from]))
        {
            ++consumedLength;
            ++from;
        }

        Token t = new SimpleToken(s.Substring(originalStart, consumedLength), TokenType.GeneralPunctuation);
        return t;
    }

    // Consume a run of CJK Unified Ideographs (U+4E00-U+9FFF) as a single character sequence.
    if (s[from] >= 0x4e00 && s[from] <= 0x9fff)
    {
        while (from < s.Length && s[from] >= 0x4e00 && s[from] <= 0x9fff)
        {
            ++consumedLength;
            ++from;
        }

        Token t = new SimpleToken(s.Substring(originalStart, consumedLength), TokenType.CharSequence);
        return t;
    }

    // TODO CJK punctuation etc.
    return base.Recognize(s, from, allowTokenBundles, ref consumedLength);
}
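The explicit range test above corresponds to the CJK Unified Ideographs block, U+4E00 through U+9FFF. A small, hedged helper shows the same test in isolation; the function name is illustrative, not part of the original code.

// Sketch: true for characters in the CJK Unified Ideographs block (U+4E00-U+9FFF).
static bool IsCjkUnifiedIdeograph(char c)
{
    return c >= 0x4E00 && c <= 0x9FFF;
}

// e.g. IsCjkUnifiedIdeograph('漢') == true   (U+6F22)
//      IsCjkUnifiedIdeograph('か') == false  (Hiragana, U+304B)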
public static void RemoveChars(this StringBuilder @this, CharacterSet chars)
{
    // Compact the builder in place: copy each character that is not in the set
    // to the next write position, then trim the leftover tail.
    var nextWriteLocation = 0;

    for (var searchLocation = 0; searchLocation < @this.Length; searchLocation++)
    {
        var c = @this[searchLocation];
        if (!chars.Contains(c))
        {
            @this[nextWriteLocation] = c;
            nextWriteLocation++;
        }
    }

    @this.Remove(nextWriteLocation, @this.Length - nextWriteLocation);
}
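A short usage sketch for RemoveChars, under the same assumption as before that CharacterSet can be constructed from a string of characters; the builder is compacted in place rather than reallocated.

var sb = new System.Text.StringBuilder("a, b; c");
sb.RemoveChars(new CharacterSet(",;"));   // hypothetical string constructor, as above
// sb.ToString() == "a b c"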
public string ParseCharactersFromCharSet(CharacterSet charSet, bool shouldIncludeChars = true, int maxCount = -1)
{
    if (maxCount == -1)
    {
        maxCount = int.MaxValue;
    }

    int startIndex = index;

    // Optimisation from profiling:
    // Store in temporary local variables
    // since they're properties that would have to access
    // the rule stack every time otherwise.
    int i = index;
    int li = lineIndex;

    int count = 0;
    while (i < _chars.Length && charSet.Contains(_chars[i]) == shouldIncludeChars && count < maxCount)
    {
        if (_chars[i] == '\n')
        {
            li++;
        }
        i++;
        count++;
    }

    index = i;
    lineIndex = li;

    int lastCharIndex = index;
    if (lastCharIndex > startIndex)
    {
        return new string(_chars, startIndex, index - startIndex);
    }
    else
    {
        return null;
    }
}
private float GetScore(string input)
{
    if (string.IsNullOrEmpty(input))
    {
        return 0;
    }

    float finalScore;
    bool cacheHit = scoreCache.TryGetValue(input, out finalScore);
    if (!cacheHit)
    {
        finalScore = ScoreForAbbreviation(input, textBoxCache);
        scoreCache.Add(input, finalScore);
    }

    return finalScore;

    // https://github.com/quicksilver/Quicksilver/blob/8be7395b795179cf51cf30ebf82779e0f9ba2138/Quicksilver/Code-QuickStepFoundation/QSSense.m
    float ScoreForAbbreviation(String str, String abbr)
    {
        return ScoreForAbbreviationWithRanges(str, abbr, new StringRange(0, str.Length), new StringRange(0, abbr.Length));
    }

    float ScoreForAbbreviationWithRanges(String str, String abbr, StringRange strRange, StringRange abbrRange)
    {
        const float IGNORED_SCORE = 0.9f;
        const float SKIPPED_SCORE = 0.15f;

        if (abbrRange.Length == 0)
        {
            return IGNORED_SCORE; // deduct some points for all remaining letters
        }

        if (abbrRange.Length > strRange.Length)
        {
            return 0.0f;
        }

        float score = 0.0f, remainingScore = 0.0f;
        int i, j;
        StringRange matchedRange, remainingStrRange = new StringRange(0, 0), adjustedStrRange = strRange;

        for (i = abbrRange.Length; i > 0; i--)
        {
            // Search for steadily smaller portions of the abbreviation
            String curAbbr = abbr.Substring(abbrRange.Start, i);
            int idx = str.IndexOf(curAbbr, adjustedStrRange.Start, adjustedStrRange.Length - abbrRange.Length + i, StringComparison.CurrentCultureIgnoreCase);
            matchedRange = new StringRange(idx, curAbbr.Length);

            if (idx == -1)
            {
                // not found
                continue;
            }

            remainingStrRange.Start = matchedRange.Start + matchedRange.Length;
            remainingStrRange.Length = strRange.Start + strRange.Length - remainingStrRange.Start;

            // Search what is left of the string with the rest of the abbreviation
            remainingScore = ScoreForAbbreviationWithRanges(str, abbr, remainingStrRange, new StringRange(abbrRange.Start + i, abbrRange.Length - i));

            if (remainingScore != 0)
            {
                score = remainingStrRange.Start - strRange.Start;

                // Ignore skipped characters if the match starts at the first letter of a word
                if (matchedRange.Start > strRange.Start)
                {
                    // some letters were skipped
                    if (WordSeperators.Contains(str.ElementAt(matchedRange.Start - 1)))
                    {
                        for (j = matchedRange.Start - 2; j >= strRange.Start; j--)
                        {
                            if (WordSeperators.Contains(str.ElementAt(j)))
                            {
                                score--;
                            }
                            else
                            {
                                score -= SKIPPED_SCORE;
                            }
                        }
                    }
                    else if (Uppercase.Contains(str.ElementAt(matchedRange.Start)))
                    {
                        for (j = matchedRange.Start - 1; j >= strRange.Start; j--)
                        {
                            if (Uppercase.Contains(str.ElementAt(j)))
                            {
                                score--;
                            }
                            else
                            {
                                score -= SKIPPED_SCORE;
                            }
                        }
                    }
                    else
                    {
                        score -= (matchedRange.Start - strRange.Start) / 2;
                    }
                }

                score += remainingScore * remainingStrRange.Length;
                score /= strRange.Length;
                return score;
            }
        }

        return 0.0f;
    }
}
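The scorer above assumes a small StringRange value type with mutable Start and Length fields, plus WordSeperators and Uppercase character collections on the containing class. A minimal stand-in consistent with how the range fields are read and assigned might look like this; it is an assumption for illustration, not the original type.

// Hypothetical stand-in: a mutable range over a string, matching the Start/Length
// accesses in ScoreForAbbreviationWithRanges above.
struct StringRange
{
    public int Start;
    public int Length;

    public StringRange(int start, int length)
    {
        Start = start;
        Length = length;
    }
}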
public string ParseUntil(ParseRule stopRule, CharacterSet pauseCharacters = null, CharacterSet endCharacters = null)
{
    int ruleId = BeginRule();

    CharacterSet pauseAndEnd = new CharacterSet();
    if (pauseCharacters != null)
    {
        pauseAndEnd.UnionWith(pauseCharacters);
    }
    if (endCharacters != null)
    {
        pauseAndEnd.UnionWith(endCharacters);
    }

    StringBuilder parsedString = new StringBuilder();
    object ruleResultAtPause = null;

    // Keep attempting to parse strings up to the pause (and end) points.
    //  - At each of the pause points, attempt to parse according to the rule
    //  - When the end point is reached (or EOF), we're done
    do
    {
        // TODO: Perhaps if no pause or end characters are passed, we should check *every* character for stopRule?
        string partialParsedString = ParseUntilCharactersFromCharSet(pauseAndEnd);
        if (partialParsedString != null)
        {
            parsedString.Append(partialParsedString);
        }

        // Attempt to run the parse rule at this pause point
        ruleResultAtPause = Peek(stopRule);

        // Rule completed - we're done
        if (ruleResultAtPause != null)
        {
            break;
        }
        else
        {
            if (endOfInput)
            {
                break;
            }

            // Reached a pause point, but rule failed. Step past and continue parsing string
            char pauseCharacter = currentCharacter;
            if (pauseCharacters != null && pauseCharacters.Contains(pauseCharacter))
            {
                parsedString.Append(pauseCharacter);
                if (pauseCharacter == '\n')
                {
                    lineIndex++;
                }
                index++;
                continue;
            }
            else
            {
                break;
            }
        }
    } while (true);

    if (parsedString.Length > 0)
    {
        return (string)SucceedRule(ruleId, parsedString.ToString());
    }
    else
    {
        return (string)FailRule(ruleId);
    }
}
public override Core.Tokenization.Token Recognize(string s, int from, bool allowTokenBundles, ref int consumedLength)
{
    // TODO this could test whether the match is followed by whitespace or non-word characters (punctuation etc.) -
    // that would be simpler than including the respective constraints in the RX

    Token winner = null;
    int winningLength = 0;
    int winningPriority = 0;

    for (int p = 0; p < _Patterns.Count; ++p)
    {
        System.Text.RegularExpressions.Regex rx = _Patterns[p].Regex;
        CharacterSet first = _Patterns[p].First;

        // NOTE if the requirement that m.Index == from is dropped, we cannot use FIRST any more
        if (first != null && from < s.Length && !first.Contains(s[from]))
        {
            continue;
        }

        System.Text.RegularExpressions.Match m = rx.Match(s, from);
        if (m != null && m.Success && m.Index == from)
        {
            if (VerifyContextConstraints(s, m.Index + m.Value.Length))
            {
                Token t = CreateToken(m.Value, m.Groups);
                // TODO set other token values?

                if (t != null && m.Length > 0)
                {
                    // Longest wins; if two matches are found with equal length, the one with
                    // higher prio wins, or if both have the same prio, the first match wins
                    if ((m.Length > winningLength) || winner == null
                        || (m.Length == winningLength && _Patterns[p].Priority > winningPriority && !allowTokenBundles))
                    {
                        winningLength = m.Length;
                        winner = t;
                        winningPriority = _Patterns[p].Priority;
                    }
                    else if (allowTokenBundles && m.Length == winningLength)
                    {
                        if (!(winner is TokenBundle))
                        {
                            winner = new TokenBundle(winner, winningPriority);
                        }
                        ((TokenBundle)winner).Add(t, _Patterns[p].Priority);
                        winningPriority = Math.Max(winningPriority, _Patterns[p].Priority);
                    }
                }
            }
        }
    }

    if (winner != null)
    {
        consumedLength = winningLength;
        return winner;
    }
    else
    {
        return null;
    }
}
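The loop above reads three members per pattern entry: Regex, First, and Priority. A hedged sketch of such a descriptor is shown below, with First acting as an optional prefilter on the first character; the class name and everything beyond those three members are assumptions, not the original type.

// Hypothetical pattern descriptor; only the members accessed in Recognize are included.
class PatternDescriptor
{
    public System.Text.RegularExpressions.Regex Regex;   // match attempted at 'from', must start there
    public CharacterSet First;                           // optional first-character prefilter; null disables it
    public int Priority;                                 // tie-breaker for equal-length matches
}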