/// <summary>
/// Converts a hexadecimal string into the bytes it encodes.
/// </summary>
/// <param name="data">Hexadecimal text; every two characters produce one byte.</param>
/// <returns>
/// The decoded bytes, or <c>null</c> when <paramref name="data"/> is <c>null</c>,
/// has an odd length, or contains a character for which
/// <c>CharUtil.HexToInt</c> returns -1.
/// </returns>
public static byte[] HexToBinary(string data)
{
    if (data == null)
    {
        return null;
    }

    if (data.Length % 2 != 0)
    {
        return null;
    }

    var decoded = new byte[data.Length / 2];
    for (int slot = 0, pos = 0; slot < decoded.Length; slot++, pos += 2)
    {
        // First character of the pair is the high nibble, second is the low nibble.
        int high = CharUtil.HexToInt(data[pos]);
        int low = CharUtil.HexToInt(data[pos + 1]);
        if (high == -1 || low == -1)
        {
            return null;
        }

        decoded[slot] = (byte)((high << 4) | low);
    }

    return decoded;
}
/// <summary>
/// Builds the text for one three-digit group of a number.
/// </summary>
/// <param name="val">Value of the group; it is formatted with the "000" pattern and read digit by digit.</param>
/// <param name="group">Index of the group, forwarded to <c>UnitsGroup</c>.</param>
/// <param name="femenine">Gender flag forwarded to <c>CentsDecsUnits</c> only when <paramref name="group"/> is below 2.</param>
/// <returns>
/// <c>null</c> when all three digits are zero and <paramref name="group"/> is odd;
/// otherwise the number text and the group name joined through <c>" ".Combine</c>.
/// </returns>
static string? ConvertTrio(int val, int group, bool? femenine)
{
    string digits = val.ToString("000");
    int hundreds = CharUtil.ToInt(digits[0]);
    int tens = CharUtil.ToInt(digits[1]);
    int ones = CharUtil.ToInt(digits[2]);

    bool allZero = hundreds == 0 && tens == 0 && ones == 0;
    if (allZero && group % 2 == 1)
    {
        return null;
    }

    // The flag is false only when the group value is exactly "001".
    bool groupFlag = unitIsNotOne(ones) || tens > 0 || hundreds > 0;
    string? groupName = UnitsGroup(group, groupFlag);

    bool? gender = group >= 2 ? null : femenine;
    string numberText = CentsDecsUnits(hundreds, tens, ones, gender);

    // For val == 1 with the "mil" group name the number text is dropped.
    string? leading = numberText;
    if (val == 1 && groupName == "mil")
    {
        leading = null;
    }

    return " ".Combine(leading, groupName);

    bool unitIsNotOne(int unit) => unit != 1;
}
/// <summary>
/// Returns the content length of <paramref name="message"/>.
/// </summary>
/// <remarks>
/// The Content-Length header is used when present. Otherwise the value from
/// <c>GetWebSocketContentLength</c> is used when it is non-negative. When neither
/// source yields a length, the result of
/// <c>ThrowHelper.FromFormatException_HeaderNotFound()</c> is returned.
/// </remarks>
/// <param name="message">Message whose headers are inspected.</param>
public static long GetContentLength(IHttpMessage message)
{
    bool hasHeader = message.Headers.TryGet(HttpHeaderNames.ContentLength, out ICharSequence headerValue);
    if (hasHeader)
    {
        return CharUtil.ParseLong(headerValue);
    }

    // The content length is known for a Web Socket message even when the
    // Content-Length header is missing.
    long fromWebSocket = GetWebSocketContentLength(message);
    if (fromWebSocket < 0)
    {
        // Length cannot be determined.
        return ThrowHelper.FromFormatException_HeaderNotFound();
    }

    return fromWebSocket;
}
/// <summary>
/// Returns the content length of <paramref name="message"/>, or
/// <paramref name="defaultValue"/> when it cannot be determined.
/// </summary>
/// <remarks>
/// The Content-Length header is used when present. Otherwise the value from
/// <c>GetWebSocketContentLength</c> is used when it is non-negative.
/// </remarks>
/// <param name="message">Message whose headers are inspected.</param>
/// <param name="defaultValue">Value returned when no length is available.</param>
public static long GetContentLength(IHttpMessage message, long defaultValue)
{
    bool hasHeader = message.Headers.TryGet(HttpHeaderNames.ContentLength, out ICharSequence headerValue);
    if (hasHeader)
    {
        return CharUtil.ParseLong(headerValue);
    }

    // The content length is known for a Web Socket message even when the
    // Content-Length header is missing.
    long fromWebSocket = GetWebSocketContentLength(message);
    return fromWebSocket >= 0 ? fromWebSocket : defaultValue;
}
/// <summary>
/// Advances the reader index of <paramref name="buffer"/> past leading bytes that are
/// ISO control characters or whitespace.
/// </summary>
/// <param name="buffer">Buffer whose reader index is moved.</param>
/// <returns>
/// <c>true</c> when a byte that is neither a control character nor whitespace was found
/// (the reader index then points at it); <c>false</c> when the scan reached the writer index.
/// </returns>
static bool SkipControlCharacters(IByteBuffer buffer)
{
    int end = buffer.WriterIndex;
    int cursor = buffer.ReaderIndex;
    bool foundContent = false;

    for (; cursor < end; cursor++)
    {
        byte current = buffer.GetByte(cursor);
        if (CharUtil.IsISOControl(current) || IsWhiteSpace(current))
        {
            continue;
        }

        // Leave the cursor on the first significant byte.
        foundContent = true;
        break;
    }

    buffer.SetReaderIndex(cursor);
    return foundContent;
}
// Writes primStr to writer one character at a time:
//  - characters reported by Symbols.IsEscapedChar are written as the string from
//    Symbols.GetEscapedCharString(ch, false), or as-is when that returns null;
//  - other ISO control characters are written as the char[] produced by
//    UnicodeUtil.GetUnicodeHexCodeFromChar;
//  - everything else is written unchanged.
// A null or empty primStr writes nothing.
// NOTE(review): this is `async void`, so callers cannot await completion or observe
// exceptions thrown by the writer — confirm that this is intended.
private async void _AppendEscapedString(TextWriter writer, string primStr)
{
    if (string.IsNullOrEmpty(primStr))
    {
        return;
    }

    foreach (char ch in primStr.ToCharArray())
    {
        if (Symbols.IsEscapedChar(ch))
        {
            // The second argument is always false here (presumably "escape forward slash" — confirm).
            string escaped = Symbols.GetEscapedCharString(ch, false);
            if (escaped == null)
            {
                await writer.WriteAsync(ch);
            }
            else
            {
                await writer.WriteAsync(escaped);
            }

            continue;
        }

        if (CharUtil.IsISOControl(ch))
        {
            char[] hexCode = UnicodeUtil.GetUnicodeHexCodeFromChar(ch);
            await writer.WriteAsync(hexCode);
            continue;
        }

        await writer.WriteAsync(ch);
    }
}
/// <summary>
/// Enumerates the character metrics that intersect <paramref name="clipRegion"/>.
/// </summary>
/// <param name="clipRegion">Region used both to select lines and to filter characters.</param>
/// <returns>
/// Each character metric whose rectangle intersects the clip region, line by line.
/// Enumeration of a line stops after the first character whose right edge lies
/// beyond the right edge of the clip bounds.
/// </returns>
public IEnumerable<ICharMetric> EnumerateCharMetrics(IReadOnlyRegion2D clipRegion)
{
    foreach (var line in EnumerateLineMetrics(clipRegion))
    {
        var lineChars = EnumerateCharMetricsByLine(line.Index, true, true);
        foreach (var metric in lineChars)
        {
            Debug.Assert(CharUtil.CanDraw(metric.Char));

            bool visible = clipRegion.IntersectsWith(metric.CharRect);
            if (visible)
            {
                yield return metric;
            }

            // Checked after the yield, exactly once per character.
            if (metric.Right > clipRegion.Bounds.Right)
            {
                break;
            }
        }
    }
}
// Creates a new HTTP method with the specified name. You will not need to
// create a new method unless you are implementing a protocol derived from
// HTTP, such as
// http://en.wikipedia.org/wiki/Real_Time_Streaming_Protocol and
// http://en.wikipedia.org/wiki/Internet_Content_Adaptation_Protocol
//
// The name is trimmed first. An ArgumentException is thrown when the trimmed
// name is empty or contains an ISO control character or whitespace.
public HttpMethod(string name)
{
    Contract.Requires(name != null);

    name = name.Trim();
    if (name.Length == 0)
    {
        throw new ArgumentException(nameof(name));
    }

    foreach (char c in name)
    {
        bool invalid = CharUtil.IsISOControl(c) || char.IsWhiteSpace(c);
        if (invalid)
        {
            throw new ArgumentException($"Invalid character '{c}' in {nameof(name)}");
        }
    }

    this.name = AsciiString.Cached(name);
}
// Skips consecutive whitespace characters and returns the next peeked character.
// A return value of 0 means no further character was available, or that a
// DotJsonMiniException was raised while peeking/skipping.
private char GobbleUpSpace()
{
    char c = (char)0;
    try
    {
        c = PeekChar();
        while (c != 0 && CharUtil.IsWhitespace(c))
        {
            c = SkipAndPeekChar();
        }
    }
    catch (DotJsonMiniException e)
    {
        // Passing the exception as a second argument selects the
        // (string format, params object[] args) overload; with no placeholder in the
        // format the exception would be dropped, so it is appended explicitly.
        System.Diagnostics.Debug.WriteLine("Failed to consume space. " + e);
        c = (char)0;
    }

    return c;
}
/// <summary>
/// Creates a protocol version from its parts.
/// </summary>
/// <param name="protocolName">Protocol name; it is trimmed and upper-cased before validation.</param>
/// <param name="majorVersion">Major version; must not be negative.</param>
/// <param name="minorVersion">Minor version; must not be negative.</param>
/// <param name="keepAliveDefault">Stored as the keep-alive default for this version.</param>
/// <param name="bytes">When <c>true</c>, the backing array of the text form is kept; otherwise <c>null</c> is stored.</param>
/// <exception cref="ArgumentException">
/// The protocol name is <c>null</c>, empty after trimming, or contains an ISO control
/// character or whitespace; or a version number is negative.
/// </exception>
HttpVersion(string protocolName, int majorVersion, int minorVersion, bool keepAliveDefault, bool bytes)
{
    if (protocolName == null)
    {
        throw new ArgumentException(nameof(protocolName));
    }

    // Upper-case with the invariant culture: the culture-sensitive ToUpper() can map
    // ASCII letters to non-ASCII ones (e.g. 'i' under a Turkish culture), which would
    // corrupt the name before it is turned into an AsciiString.
    protocolName = protocolName.Trim().ToUpperInvariant();
    if (string.IsNullOrEmpty(protocolName))
    {
        throw new ArgumentException("empty protocolName");
    }

    // ReSharper disable once ForCanBeConvertedToForeach
    for (int i = 0; i < protocolName.Length; i++)
    {
        char c = protocolName[i];
        if (CharUtil.IsISOControl(c) || char.IsWhiteSpace(c))
        {
            throw new ArgumentException($"invalid character {c} in protocolName");
        }
    }

    if (majorVersion < 0)
    {
        throw new ArgumentException("negative majorVersion");
    }

    if (minorVersion < 0)
    {
        throw new ArgumentException("negative minorVersion");
    }

    this.protocolName = protocolName;
    this.majorVersion = majorVersion;
    this.minorVersion = minorVersion;
    // Text form is "NAME/major.minor".
    this.text = new AsciiString(protocolName + '/' + majorVersion + '.' + minorVersion);
    this.keepAliveDefault = keepAliveDefault;
    this.bytes = bytes ? this.text.Array : null;
}
/// <summary>
/// Moves the caret to the next word.
/// </summary>
/// <remarks>
/// At the end of the current line this delegates to <c>MoveRight</c>. Otherwise the
/// caret skips the rest of the current word group and then any whitespace after it.
/// </remarks>
/// <param name="isSelect">Forwarded to <c>MoveRight</c> / <c>SetCaretPos</c>.</param>
public void MoveNextWord(bool isSelect)
{
    if (Caret.Pos == CurrentLineEndPos)
    {
        MoveRight(isSelect);
        return;
    }

    var lineText = CurrentLine.Text;
    var target = Caret.Pos.CharIndex + 1;

    // Step over characters that belong to the same word group as their predecessor.
    for (; target < lineText.Length; target++)
    {
        if (!CharUtil.EqualsWordGroup(lineText[target - 1], lineText[target]))
        {
            break;
        }
    }

    // Then step over the whitespace that follows.
    for (; target < lineText.Length; target++)
    {
        if (!Char.IsWhiteSpace(lineText[target]))
        {
            break;
        }
    }

    SetCaretPos(new TextPos(Caret.Pos.LineIndex, target), isSelect, true);
}
/// <summary>
/// Moves the caret to the previous word.
/// </summary>
/// <remarks>
/// At the start of the current line this delegates to <c>MoveLeft</c>. Otherwise the
/// caret skips whitespace to its left and then moves to the start of the word group
/// it lands in.
/// </remarks>
/// <param name="isSelect">Forwarded to <c>MoveLeft</c> / <c>SetCaretPos</c>.</param>
public void MovePrevWord(bool isSelect)
{
    if (Caret.Pos == CurrentLineTopPos)
    {
        MoveLeft(isSelect);
        return;
    }

    var lineText = CurrentLine.Text;
    var target = Caret.Pos.CharIndex - 1;

    // Step back over whitespace.
    for (; 0 < target; target--)
    {
        if (!Char.IsWhiteSpace(lineText[target]))
        {
            break;
        }
    }

    // Step back while the preceding character is in the same word group.
    for (; 0 < target; target--)
    {
        if (!CharUtil.EqualsWordGroup(lineText[target], lineText[target - 1]))
        {
            break;
        }
    }

    SetCaretPos(new TextPos(Caret.Pos.LineIndex, target), isSelect, true);
}
//------------------------------------------------------------
// IOUtil.IsPathRooted
//
/// <summary>
/// <para>Path.IsPathRooted determines
/// that a relative path with a drive name is an absolute path.
/// For example, it returns true for @"C:User\temp.txt". (CLS 2.0)</para>
/// <para>This method returns false for such drive-relative paths: a path of the form
/// "X:" followed by a character that is not a directory separator. In every other
/// case the result of Path.IsPathRooted is returned.</para>
/// </summary>
/// <param name="path">Path to examine.</param>
/// <returns>
/// <c>true</c> when Path.IsPathRooted accepts the path and it is not drive-relative;
/// otherwise <c>false</c>.
/// </returns>
//------------------------------------------------------------
internal static bool IsPathRooted(string path)
{
    if (!Path.IsPathRooted(path))
    {
        return false;
    }

    // Skip leading whitespace without running past the end of the string.
    int i = 0;
    while (i < path.Length && Char.IsWhiteSpace(path[i]))
    {
        ++i;
    }

    // "X:" followed by something other than a separator is relative to the current
    // directory of drive X. Both separator characters are accepted so that
    // "C:/dir" is treated the same as "C:\dir".
    if (path.Length - i >= 3 &&
        CharUtil.IsAsciiAlphabet(path[i]) &&
        path[i + 1] == Path.VolumeSeparatorChar &&
        path[i + 2] != Path.DirectorySeparatorChar &&
        path[i + 2] != Path.AltDirectorySeparatorChar)
    {
        return false;
    }

    return true;
}
/// <summary>
/// Filters items by comparing the most recently typed letter against the pinyin
/// initial of the Chinese character at the same position in each item's filter string.
/// </summary>
/// <param name="_unfilteredData">Items to examine.</param>
/// <param name="_displayMember">Forwarded to <c>GetFilterString</c>.</param>
/// <param name="userInput">Text typed so far; only its last character is compared.</param>
/// <returns>
/// The items whose character at index <c>userInput.Length - 1</c> is a Chinese letter
/// with at least one pinyin reading starting with the typed letter. Items whose
/// character at that position is not Chinese are never included.
/// </returns>
protected override List<object> _Filter(List<object> _unfilteredData, string _displayMember, string userInput)
{
    int lastTyped = userInput.Length - 1;
    var matches = new List<object>();
    string typed = userInput.ToLower();

    foreach (var item in _unfilteredData)
    {
        var candidate = GetFilterString(item, _displayMember).ToLower();
        if (lastTyped >= candidate.Length)
        {
            continue;
        }

        char letter = candidate[lastTyped];
        if (!CharUtil.IsChineseLetter(letter))
        {
            continue;
        }

        // A character may have several readings; any one of them is enough.
        List<string> readings = PinyinUtil.Chinese2Pinyin(letter);
        foreach (var reading in readings)
        {
            if (reading.ToLower()[0] != typed[lastTyped])
            {
                continue;
            }

            matches.Add(item);
            break;
        }
    }

    return matches;
}
// Skips consecutive whitespace and returns the next peeked character.
// A return value of 0 means the end of the JSON string was reached, or that a
// JsonTokenizerException was raised while peeking/skipping (it is not rethrown).
// TBD: use a "look ahead" implementation similar to readString()?
private char GobbleUpSpace()
{
    try
    {
        char next = PeekChar();
        while (next != 0 && CharUtil.IsWhitespace(next))
        {
            next = SkipAndPeekChar();
        }

        return next;
    }
    catch (JsonTokenizerException)
    {
        return (char)0;
    }
}
/// <summary>
/// Creates a new HTTP method with the specified name. You will not need to
/// create a new method unless you are implementing a protocol derived from
/// HTTP, such as
/// http://en.wikipedia.org/wiki/Real_Time_Streaming_Protocol and
/// http://en.wikipedia.org/wiki/Internet_Content_Adaptation_Protocol
/// </summary>
/// <param name="name">
/// Method name. It is trimmed; a null or empty name, or one containing an ISO control
/// character or whitespace, is reported through <c>ThrowHelper</c>.
/// </param>
public HttpMethod(string name)
{
    if (name is null)
    {
        ThrowHelper.ThrowArgumentNullException(ExceptionArgument.name);
    }

    name = name.Trim();
    if (name.Length == 0)
    {
        ThrowHelper.ThrowArgumentNullException(ExceptionArgument.name);
    }

    foreach (char c in name)
    {
        bool invalid = CharUtil.IsISOControl(c) || char.IsWhiteSpace(c);
        if (invalid)
        {
            ThrowHelper.ThrowArgumentException_InvalidMethodName(c, name);
        }
    }

    this.name = AsciiString.Cached(name);
}
// Adding 1 to the last letter of either case is expected to return that same letter.
public void AddAlpha_PastUpperBound()
{
    var lowerResult = CharUtil.AlphaAdd(1, 'z');
    Assert.AreEqual('z', lowerResult);

    var upperResult = CharUtil.AlphaAdd(1, 'Z');
    Assert.AreEqual('Z', upperResult);
}
/// <summary>
/// Enumerates one metric per drawn cell for the characters of a single line.
/// </summary>
/// <param name="lineIndex">Index of the line in <c>_textView.Doc.Lines</c>.</param>
/// <param name="isTabProcess">When true, a tab yields one space-width cell per column counted for it instead of a single wide cell.</param>
/// <param name="isAddMetaChar">When true, a trailing space is appended for every line except the last (debug code replaces it with visible markers).</param>
/// <returns>
/// The values returned by <c>chItem.Init(...)</c>, in left-to-right order.
/// NOTE(review): a single CharMetric is created before the loop and re-initialised for
/// every yield — callers presumably must not hold on to yielded items; confirm.
/// </returns>
private IEnumerable <ICharMetric> EnumerateCharMetricsByLine(int lineIndex, bool isTabProcess, bool isAddMetaChar)
{
    var chars = _textView.Doc.Lines[lineIndex].Text.AsEnumerable();
    if (isAddMetaChar)
    {
        // Every line but the last gets an extra trailing space cell.
        if (lineIndex < _textView.Doc.Lines.LastIndex())
        {
            chars = chars.Append(CharUtil.Space);
        }
        // Debug code: swap the trailing space for a '↓' marker, or append "[EOF]" on the last line.
        DebugUtil.DebugCode(() =>
        {
            if (lineIndex < _textView.Doc.Lines.LastIndex())
            {
                chars = chars.SkipLast(1).Append('↓');
            }
            else
            {
                chars = chars.Concat("[EOF]");
            }
        });
    }
    var lineHeight = _textView.GetLineHeight();
    var columnCounter = new ColumnCounter(_textView._settings.tabWidth);
    var originalIndex = 0; // position in the enumerated character sequence
    var index = 0;         // position among the yielded cells (a tab may yield several)
    var lineLength = _textView.Doc.Lines[lineIndex].Length;
    var x = 0.0;           // running x offset: sum of the widths yielded so far
    var chItem = new CharMetric();
    foreach (var ch in chars)
    {
        columnCounter.Add(ch);
        var sourceChar = ch;
        var color = Color.Black;
        var backgroundColor = Color.Transparent;
#warning test
        //if (!CharUtil.CanDraw(ch) && ch != CharUtil.Tab)
        //{
        //    // Keep things roughly displayable even when a binary file is opened
        //    ch = '?';
        //}
        if (sourceChar == CharUtil.Tab && isTabProcess)
        {
            // Expand the tab into LastCharCount cells, each measured as a space in the ASCII font.
            sourceChar = CharUtil.Space;
            for (var i = 0; i < columnCounter.LastCharCount; i++)
            {
                // Debug code: draw the tab as '>' followed by '.' in a highlight colour.
                DebugUtil.DebugCode(() =>
                {
                    sourceChar = i == 0 ? '>' : '.';
                    color = Color.DarkCyan;
                    backgroundColor = Color.FromRgb(0x00aaaa);
                });
                var width = _textView.AsciiFont.MeasureChar(sourceChar).Width;
                yield return(chItem.Init(
                    lineIndex, lineHeight, sourceChar, x, index, originalIndex, 1, color, backgroundColor,
                    _textView.AsciiFont, width, _textView._settings.lineHeightAdjust / 2.0));
                x += width;
                index++;
            }
        }
        else
        {
            // Debug code: show full-width spaces as '□' and colour the appended meta characters.
            DebugUtil.DebugCode(() =>
            {
                if (sourceChar == CharUtil.FullWidthSpace)
                {
                    sourceChar = '□';
                    color = Color.DarkCyan;
                }
                if (originalIndex >= lineLength)
                {
                    color = Color.DarkCyan;
                }
            });
            // An unexpanded tab is measured as a space; its width is scaled by LastCharCount below.
            var tempChar = (sourceChar == CharUtil.Tab) ? CharUtil.Space : sourceChar;
            var font = CharUtil.IsAscii(tempChar) ? _textView.AsciiFont : _textView.JpFont;
            var width = font.MeasureChar(tempChar).Width *columnCounter.LastCharCount;
            yield return(chItem.Init(
                lineIndex, lineHeight, sourceChar, x, index, originalIndex, columnCounter.LastColumnCount, color,
                backgroundColor, font, width, _textView._settings.lineHeightAdjust / 2.0));
            x += width;
            index++;
        }
        originalIndex++;
    }
}
/// <summary>
/// Parses trailing headers from <paramref name="buffer"/> line by line.
/// </summary>
/// <param name="buffer">Buffer handed to <c>_headerParser.Parse</c>.</param>
/// <returns>
/// <c>null</c> when the header parser returns no line (progress so far stays in <c>_trailer</c>);
/// <c>EmptyLastHttpContent.Default</c> when the first line is empty and no trailer was
/// started earlier; otherwise the accumulated last content once an empty line is read.
/// </returns>
private ILastHttpContent ReadTrailingHeaders(IByteBuffer buffer)
{
    AppendableCharSequence line = _headerParser.Parse(buffer);
    if (line is null)
    {
        return(null);
    }
    ILastHttpContent trailingHeaders = _trailer;
    if (0u >= (uint)line.Count && trailingHeaders is null)
    {
        // We have received the empty line which signals the trailer is complete and did not parse any trailers
        // before. Just return an empty last content to reduce allocations.
        return(EmptyLastHttpContent.Default);
    }
    AsciiString lastHeader = null;
    if (trailingHeaders is null)
    {
        // First trailer line of this message: create the holder and remember it in _trailer.
        trailingHeaders = new DefaultLastHttpContent(Unpooled.Empty, ValidateHeaders);
        _trailer = trailingHeaders;
    }
    while ((uint)line.Count > 0u)
    {
        byte firstChar = line.Bytes[0];
        if (lastHeader is object && (firstChar == c_space || firstChar == c_tab))
        {
            // A line starting with space/tab continues the previous header's value.
            IList <ICharSequence> current = trailingHeaders.TrailingHeaders.GetAll(lastHeader);
            if ((uint)current.Count > 0u)
            {
                int lastPos = current.Count - 1;
                // Comment carried over from the original code base; the trimmed line is
                // kept in its own local before being appended to the last value.
                //please do not make one line from below code
                //as it breaks +XX:OptimizeStringConcat optimization
                ICharSequence lineTrimmed = CharUtil.Trim(line);
                current[lastPos] = new AsciiString($"{current[lastPos]}{lineTrimmed}");
            }
        }
        else
        {
            // SplitHeader fills _name and _value from the line.
            SplitHeader(line);
            AsciiString headerName = _name;
            // Content-Length, Transfer-Encoding and Trailer are not added to the trailing headers.
            if (!HttpHeaderNames.ContentLength.ContentEqualsIgnoreCase(headerName) &&
                !HttpHeaderNames.TransferEncoding.ContentEqualsIgnoreCase(headerName) &&
                !HttpHeaderNames.Trailer.ContentEqualsIgnoreCase(headerName))
            {
                _ = trailingHeaders.TrailingHeaders.Add(headerName, _value);
            }
            lastHeader = _name;
            // reset name and value fields
            _name = null;
            _value = null;
        }
        line = _headerParser.Parse(buffer);
        if (line is null)
        {
            return(null);
        }
    }
    // Empty line reached: the trailer is complete, so clear the stored partial state.
    _trailer = null;
    return(trailingHeaders);
}
/// <summary>
/// Parses the message headers from <paramref name="buffer"/> and decides which
/// decoder state comes next.
/// </summary>
/// <param name="buffer">Buffer handed to <c>_headerParser.Parse</c>.</param>
/// <returns>
/// <c>null</c> when the header parser returns no line; otherwise the next state:
/// <c>SkipControlChars</c>, <c>ReadChunkSize</c>, <c>ReadFixedLengthContent</c> or
/// <c>ReadVariableLengthContent</c>.
/// </returns>
State?ReadHeaders(IByteBuffer buffer)
{
    IHttpMessage httpMessage = _message;
    HttpHeaders headers = httpMessage.Headers;
    AppendableCharSequence line = _headerParser.Parse(buffer);
    if (line is null)
    {
        return(null);
    }
    // ReSharper disable once ConvertIfDoToWhile
    if ((uint)line.Count > 0u)
    {
        do
        {
            byte firstChar = line.Bytes[0];
            if (_name is object && (firstChar == c_space || firstChar == c_tab))
            {
                // A line starting with space/tab continues the pending header value;
                // it is appended after a single space.
                //please do not make one line from below code
                //as it breaks +XX:OptimizeStringConcat optimization
                ICharSequence trimmedLine = CharUtil.Trim(line);
                _value = new AsciiString($"{_value} {trimmedLine}");
            }
            else
            {
                // A new header starts: flush the pending one, then split this line.
                if (_name is object)
                {
                    _ = headers.Add(_name, _value);
                }
                SplitHeader(line);
            }
            line = _headerParser.Parse(buffer);
            if (line is null)
            {
                return(null);
            }
        } while ((uint)line.Count > 0u);
    }
    // Add the last header.
    if (_name is object)
    {
        _ = headers.Add(_name, _value);
    }
    // reset name and value fields
    _name = null;
    _value = null;
    var values = headers.GetAll(HttpHeaderNames.ContentLength);
    uint contentLengthValuesCount = (uint)values.Count;
    if (contentLengthValuesCount > 0u)
    {
        // Guard against multiple Content-Length headers as stated in
        // https://tools.ietf.org/html/rfc7230#section-3.3.2:
        //
        // If a message is received that has multiple Content-Length header
        // fields with field-values consisting of the same decimal value, or a
        // single Content-Length header field with a field value containing a
        // list of identical decimal values (e.g., "Content-Length: 42, 42"),
        // indicating that duplicate Content-Length header fields have been
        // generated or combined by an upstream message processor, then the
        // recipient MUST either reject the message as invalid or replace the
        // duplicated field-values with a single valid Content-Length field
        // containing that decimal value prior to determining the message body
        // length or forwarding the message.
        if (contentLengthValuesCount > 1u && httpMessage.ProtocolVersion == HttpVersion.Http11)
        {
            ThrowHelper.ThrowArgumentException_Multiple_Content_Length_Headers_Found();
        }
        // Only the first Content-Length value is parsed into _contentLength.
        if (!long.TryParse(values[0].ToString(), out _contentLength))
        {
            ThrowHelper.ThrowArgumentException_Invalid_Content_Length();
        }
    }
    if (IsContentAlwaysEmpty(httpMessage))
    {
        HttpUtil.SetTransferEncodingChunked(httpMessage, false);
        return(State.SkipControlChars);
    }
    else if (HttpUtil.IsTransferEncodingChunked(httpMessage))
    {
        // Chunked encoding together with Content-Length on HTTP/1.1 gets dedicated handling.
        if (contentLengthValuesCount > 0u && httpMessage.ProtocolVersion == HttpVersion.Http11)
        {
            HandleTransferEncodingChunkedWithContentLength(httpMessage);
        }
        return(State.ReadChunkSize);
    }
    else if (ContentLength() >= 0L)
    {
        return(State.ReadFixedLengthContent);
    }
    else
    {
        return(State.ReadVariableLengthContent);
    }
}
/// <summary>
/// Scans a cookie header and adds every cookie built by <c>InitCookie</c> to
/// <paramref name="cookies"/>.
/// </summary>
/// <param name="header">Raw header text; NAME, NAME= and NAME=VALUE entries are recognised, values ending at the next ';'.</param>
/// <param name="cookies">Collection that receives the decoded cookies.</param>
private void Decode(string header, ICollection <ICookie> cookies)
{
    int headerLen = header.Length;
    int i = 0;
    bool rfc2965Style = false;
    if (CharUtil.RegionMatchesIgnoreCase(header, 0, RFC2965Version, 0, RFC2965Version.Count))
    {
        // RFC 2965 style cookie, move to after version value
        // (IndexOf returns -1 when there is no ';', which leaves i at 0)
        i = header.IndexOf(';') + 1;
        rfc2965Style = true;
    }
    // loop
    while (true)
    {
        // Skip spaces and separators.
        // NOTE(review): the "NAME;" case below leaves i on the ';', so IsSpace
        // presumably also accepts ';' — confirm against its definition.
        while (true)
        {
            if (i == headerLen)
            {
                // End of header: leave the outer loop.
                goto loop;
            }
            char c = header[i];
            if (IsSpace(c))
            {
                i++;
                continue;
            }
            break;
        }
        int nameBegin = i;
        int nameEnd;
        int valueBegin;
        int valueEnd;
        // Scan the name, and the value bounds when an '=' follows.
        while (true)
        {
            char curChar = header[i];
            switch (curChar)
            {
                case HttpConstants.SemicolonChar:
                    // NAME; (no value till ';')
                    nameEnd = i;
                    valueBegin = valueEnd = -1;
                    goto loop0;
                case HttpConstants.EqualsSignChar:
                    // NAME=VALUE
                    nameEnd = i;
                    i++;
                    if (i == headerLen)
                    {
                        // NAME= (empty value, i.e. nothing after '=')
                        valueBegin = valueEnd = 0;
                        goto loop0;
                    }
                    valueBegin = i;
                    // NAME=VALUE;
                    // The value runs to the next ';' or, when there is none, to the end of the header.
                    int semiPos = header.IndexOf(';', i);
                    valueEnd = i = semiPos > 0 ? semiPos : headerLen;
                    goto loop0;
                default:
                    i++;
                    break;
            }
            if (i == headerLen)
            {
                // NAME (no value till the end of string)
                nameEnd = headerLen;
                valueBegin = valueEnd = -1;
                break;
            }
        }
        // Name/value bounds are known at this point.
        loop0:
        if (rfc2965Style && (CharUtil.RegionMatches(header, nameBegin, RFC2965Path, 0, RFC2965Path.Count) ||
                             CharUtil.RegionMatches(header, nameBegin, RFC2965Domain, 0, RFC2965Domain.Count) ||
                             CharUtil.RegionMatches(header, nameBegin, RFC2965Port, 0, RFC2965Port.Count)))
        {
            // skip obsolete RFC2965 fields
            continue;
        }
        DefaultCookie cookie = InitCookie(header, nameBegin, nameEnd, valueBegin, valueEnd);
        if (cookie is object)
        {
            cookies.Add(cookie);
        }
    }
    loop:
    return;
}
/// <summary>
/// Decodes a cookie header into a set of cookies.
/// </summary>
/// <param name="header">Raw header text; must not be null (declared through <c>Contract.Requires</c>).</param>
/// <returns>
/// <c>Empty</c> when the header has zero length; otherwise a new <c>SortedSet</c>
/// holding every cookie for which <c>InitCookie</c> returned a non-null result.
/// </returns>
public ISet <ICookie> Decode(string header)
{
    Contract.Requires(header != null);
    int headerLen = header.Length;
    if (headerLen == 0)
    {
        return(Empty);
    }
    var cookies = new SortedSet <ICookie>();
    int i = 0;
    bool rfc2965Style = false;
    if (CharUtil.RegionMatchesIgnoreCase(header, 0, RFC2965Version, 0, RFC2965Version.Count))
    {
        // RFC 2965 style cookie, move to after version value
        // (IndexOf returns -1 when there is no ';', which leaves i at 0)
        i = header.IndexOf(';') + 1;
        rfc2965Style = true;
    }
    // loop
    for (;;)
    {
        // Skip spaces and separators.
        for (;;)
        {
            if (i == headerLen)
            {
                // End of header: leave the outer loop.
                goto loop;
            }
            char c = header[i];
            // Tab, LF, VT (0x0b), FF, CR, space, ',' and ';' are all skipped.
            if (c == '\t' || c == '\n' || c == 0x0b || c == '\f' || c == '\r' || c == ' ' || c == ',' || c == ';')
            {
                i++;
                continue;
            }
            break;
        }
        int nameBegin = i;
        int nameEnd;
        int valueBegin;
        int valueEnd;
        // Scan the name, and the value bounds when an '=' follows.
        for (;;)
        {
            char curChar = header[i];
            if (curChar == ';')
            {
                // NAME; (no value till ';')
                nameEnd = i;
                valueBegin = valueEnd = -1;
                break;
            }
            else if (curChar == '=')
            {
                // NAME=VALUE
                nameEnd = i;
                i++;
                if (i == headerLen)
                {
                    // NAME= (empty value, i.e. nothing after '=')
                    valueBegin = valueEnd = 0;
                    break;
                }
                valueBegin = i;
                // NAME=VALUE;
                // The value runs to the next ';' or, when there is none, to the end of the header.
                int semiPos = header.IndexOf(';', i);
                valueEnd = i = semiPos > 0 ? semiPos : headerLen;
                break;
            }
            else
            {
                i++;
            }
            if (i == headerLen)
            {
                // NAME (no value till the end of string)
                nameEnd = headerLen;
                valueBegin = valueEnd = -1;
                break;
            }
        }
        if (rfc2965Style && (CharUtil.RegionMatches(header, nameBegin, RFC2965Path, 0, RFC2965Path.Count) ||
                             CharUtil.RegionMatches(header, nameBegin, RFC2965Domain, 0, RFC2965Domain.Count) ||
                             CharUtil.RegionMatches(header, nameBegin, RFC2965Port, 0, RFC2965Port.Count)))
        {
            // skip obsolete RFC2965 fields
            continue;
        }
        DefaultCookie cookie = this.InitCookie(header, nameBegin, nameEnd, valueBegin, valueEnd);
        if (cookie != null)
        {
            cookies.Add(cookie);
        }
    }
    loop:
    return(cookies);
}
// Fast-Path implementation
/// <summary>
/// Encodes the first <paramref name="len"/> characters of <paramref name="value"/> as
/// UTF-8 into <paramref name="buffer"/>, starting at <paramref name="writerIndex"/>.
/// </summary>
/// <param name="buffer">Destination; written through <c>_SetByte</c>.</param>
/// <param name="writerIndex">Index of the first byte to write.</param>
/// <param name="value">Source text.</param>
/// <param name="len">Number of characters of <paramref name="value"/> to encode.</param>
/// <returns>The number of bytes written.</returns>
internal static int WriteUtf8(AbstractByteBuffer buffer, int writerIndex, string value, int len)
{
    int oldWriterIndex = writerIndex;
    // We can use the _set methods as these not need to do any index checks and reference checks.
    // This is possible as we called ensureWritable(...) before.
    for (int i = 0; i < len; i++)
    {
        char c = value[i];
        if (c < 0x80)
        {
            // One byte.
            buffer._SetByte(writerIndex++, (byte)c);
        }
        else if (c < 0x800)
        {
            // Two bytes.
            buffer._SetByte(writerIndex++, (byte)(0xc0 | (c >> 6)));
            buffer._SetByte(writerIndex++, (byte)(0x80 | (c & 0x3f)));
        }
        else if (char.IsSurrogate(c))
        {
            if (!char.IsHighSurrogate(c))
            {
                // Low surrogate without a preceding high surrogate.
                buffer._SetByte(writerIndex++, WriteUtfUnknown);
                continue;
            }
            char c2;
            try
            {
                // Surrogate Pair consumes 2 characters. Optimistically read the next character
                // instead of bounds-checking first. If the high surrogate is the last character
                // of the string, the catch below writes the unknown marker and stops encoding.
                c2 = value[++i];
            }
            catch (IndexOutOfRangeException)
            {
                buffer._SetByte(writerIndex++, WriteUtfUnknown);
                break;
            }
            if (!char.IsLowSurrogate(c2))
            {
                // Unpaired high surrogate: the unknown marker is written for it, then the
                // following character is written as one byte (the marker again if it is
                // itself a high surrogate).
                buffer._SetByte(writerIndex++, WriteUtfUnknown);
                buffer._SetByte(writerIndex++, char.IsHighSurrogate(c2) ? WriteUtfUnknown : c2);
                continue;
            }
            int codePoint = CharUtil.ToCodePoint(c, c2);
            // See http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G2630.
            // Four bytes.
            buffer._SetByte(writerIndex++, (byte)(0xf0 | (codePoint >> 18)));
            buffer._SetByte(writerIndex++, (byte)(0x80 | ((codePoint >> 12) & 0x3f)));
            buffer._SetByte(writerIndex++, (byte)(0x80 | ((codePoint >> 6) & 0x3f)));
            buffer._SetByte(writerIndex++, (byte)(0x80 | (codePoint & 0x3f)));
        }
        else
        {
            // Three bytes.
            buffer._SetByte(writerIndex++, (byte)(0xe0 | (c >> 12)));
            buffer._SetByte(writerIndex++, (byte)(0x80 | ((c >> 6) & 0x3f)));
            buffer._SetByte(writerIndex++, (byte)(0x80 | (c & 0x3f)));
        }
    }
    return(writerIndex - oldWriterIndex);
}
/// <summary>
/// Compares this instance with <paramref name="other"/> by delegating to
/// <c>CharUtil.ContentEqualsIgnoreCase</c>.
/// </summary>
/// <param name="other">Sequence to compare against.</param>
/// <returns>The result of <c>CharUtil.ContentEqualsIgnoreCase(this, other)</c>.</returns>
public bool ContentEqualsIgnoreCase(ICharSequence other)
{
    return CharUtil.ContentEqualsIgnoreCase(this, other);
}
/// <summary>
/// Atomic segmentation into a table. A run that starts with a digit and continues with
/// digits, '.' or '%' forms one word; a run of English characters and spaces forms one
/// word; every other character is a word of its own.
/// </summary>
/// <param name="sentence">Characters to segment.</param>
/// <returns>
/// One row of three strings per word. Column 0 is "M" for a number run, "W" for an
/// English/space run, or the character itself for a single-character word. Column 1 is
/// the word text. Column 2 is left unset here (the original notes reserve it for a tag
/// assigned later). The table is passed through <c>ResizeArray</c> when fewer rows than
/// input characters were used.
/// </returns>
public static string[][] AtomSeg2Table(char[] sentence)
{
    int total = sentence.Length;

    // Allocate the worst case up front: one row per input character.
    var table = new string[total][];
    for (int row = 0; row < total; row++)
    {
        table[row] = new string[3];
    }

    int size = 0;  // number of rows actually filled
    int start = 0; // index of the first character of the current word
    while (start < total)
    {
        char head = sentence[start];
        int stop = start + 1; // exclusive end of the current word
        string[] entry = table[size];

        if (head >= '0' && head <= '9')
        {
            for (; stop < total; stop++)
            {
                char next = sentence[stop];
                bool numeric = next == '.' || next == '%' || (next >= '0' && next <= '9');
                if (!numeric)
                {
                    break;
                }
            }

            entry[0] = "M";
            entry[1] = new string(sentence, start, stop - start);
        }
        else if (CharUtil.IsEnglishChar(head) || head == ' ')
        {
            for (; stop < total; stop++)
            {
                char next = sentence[stop];
                if (!(CharUtil.IsEnglishChar(next) || next == ' '))
                {
                    break;
                }
            }

            entry[0] = "W";
            entry[1] = new string(sentence, start, stop - start);
        }
        else
        {
            // Both columns share the same string instance.
            entry[0] = entry[1] = head.ToString();
        }

        size++;
        start = stop;
    }

    if (size < total)
    {
        return ResizeArray(table, size);
    }

    return table;
}
/// <summary>
/// Compares this instance with <paramref name="other"/> by delegating to
/// <c>CharUtil.ContentEquals</c>.
/// </summary>
/// <param name="other">Sequence to compare against.</param>
/// <returns>The result of <c>CharUtil.ContentEquals(this, other)</c>.</returns>
public bool ContentEquals(ICharSequence other)
{
    return CharUtil.ContentEquals(this, other);
}
/// <summary>
/// Compares a region of this instance with a region of <paramref name="seq"/> by
/// delegating to <c>CharUtil.RegionMatchesIgnoreCase</c>.
/// </summary>
/// <param name="thisStart">Start offset within this instance.</param>
/// <param name="seq">Sequence to compare against.</param>
/// <param name="start">Start offset within <paramref name="seq"/>.</param>
/// <param name="length">Number of characters to compare.</param>
/// <returns>The result of the delegated call.</returns>
public bool RegionMatchesIgnoreCase(int thisStart, ICharSequence seq, int start, int length)
{
    return CharUtil.RegionMatchesIgnoreCase(this, thisStart, seq, start, length);
}
/// <summary>
/// Searches this instance for <paramref name="ch"/> by delegating to
/// <c>CharUtil.IndexOf</c>.
/// </summary>
/// <param name="ch">Character to look for.</param>
/// <param name="start">Offset passed on as the search start; defaults to 0.</param>
/// <returns>The result of <c>CharUtil.IndexOf(this, ch, start)</c>.</returns>
public int IndexOf(char ch, int start = 0)
{
    return CharUtil.IndexOf(this, ch, start);
}
// Adding -1 to the first letter of either case is expected to return that same letter.
public void AddAlpha_PastLowerBound()
{
    var lowerResult = CharUtil.AlphaAdd(-1, 'a');
    Assert.AreEqual('a', lowerResult);

    var upperResult = CharUtil.AlphaAdd(-1, 'A');
    Assert.AreEqual('A', upperResult);
}
/// <summary>
/// Atomic segmentation without part-of-speech tags. A run that starts with a digit and
/// continues with digits, '.' or '%' forms one word; a run of English characters forms
/// one word; every other character (spaces included) is a word of its own.
/// </summary>
/// <param name="sentence">Characters to segment.</param>
/// <returns>The words in input order.</returns>
public static List<string> AtomSeg(char[] sentence)
{
    int total = sentence.Length;
    var words = new List<string>(total);

    int start = 0; // index of the first character of the current word
    while (start < total)
    {
        char head = sentence[start];
        int stop = start + 1; // exclusive end of the current word

        if (head >= '0' && head <= '9')
        {
            for (; stop < total; stop++)
            {
                char next = sentence[stop];
                bool numeric = next == '.' || next == '%' || (next >= '0' && next <= '9');
                if (!numeric)
                {
                    break;
                }
            }
        }
        else if (CharUtil.IsEnglishChar(head))
        {
            // Spaces are not merged into English runs here, so they separate words.
            for (; stop < total; stop++)
            {
                if (!CharUtil.IsEnglishChar(sentence[stop]))
                {
                    break;
                }
            }
        }

        words.Add(new string(sentence, start, stop - start));
        start = stop;
    }

    return words;
}