Exemple #1
0
        /// <summary>
        /// Creates an empty <c>MiniList</c> of objects and appends every element of <c>Objects</c> to it.
        /// The populated list is discarded when the method returns.
        /// </summary>
        public void MiniList()
        {
            var target = new MiniList <object>();

            foreach (var element in Objects)
            {
                target.Add(element);
            }
        }
Exemple #2
0
        /// <summary>
        /// Walks <paramref name="s"/> from <paramref name="startIndex"/> up to (but not including)
        /// <paramref name="endIndex"/>, reads one value at a time with <c>ToUtf16Int</c> and hands it to
        /// <c>DecompCore</c> together with <paramref name="dest"/>.
        /// </summary>
        /// <param name="s">Source string.</param>
        /// <param name="startIndex">Index of the first UTF-16 unit to process.</param>
        /// <param name="endIndex">Index at which processing stops (exclusive).</param>
        /// <param name="dest">List passed by reference to <c>DecompCore</c> for every value read.</param>
        private static void DecomposeInRange(string s, int startIndex, int endIndex, ref MiniList <char> dest)
        {
            for (var pos = startIndex; pos < endIndex;)
            {
                bool consumedPair;
                var  value = ToUtf16Int(s, pos, out consumedPair);

                DecompCore(value, ref dest);

                // A surrogate pair occupies two UTF-16 units, anything else one.
                var step = consumedPair ? 2 : 1;
                pos += step;
            }
        }
Exemple #3
0
        /// <summary>
        /// Builds the composed form of <paramref name="s"/> by alternating between copying stretches that need no work
        /// and running <c>DecomposeInRange</c> followed by <c>ComposeInRange</c> on the stretches that do.
        /// </summary>
        /// <param name="s">Input string.</param>
        /// <param name="result">
        /// Receives the output characters when the method returns <c>true</c>; receives an empty list otherwise.
        /// </param>
        /// <returns>
        /// <c>true</c> if <paramref name="result"/> holds the result;
        /// <c>false</c> if the input was already normalized.
        /// </returns>
        public static bool Compose(string s, out MiniList <char> result)
        {
            // Quick check: -1 from IndexOfLastNormalizedChar means nothing needs normalizing.
            bool startsWithSurrogatePair;
            var  pos = IndexOfLastNormalizedChar(s, 0, out startsWithSurrogatePair);

            if (pos == -1)
            {
                result = new MiniList <char>();
                return false;
            }

            // The real work starts here.
            result = StringToMiniList(s, pos);

            do
            {
                var firstCharLength = startsWithSurrogatePair ? 2 : 1;
                var segmentEnd      = FindNextNfcQcYes(s, pos + firstCharLength);
                var composeFrom     = result.Count;

                DecomposeInRange(s, pos, segmentEnd, ref result);
                ComposeInRange(ref result, composeFrom);

                if (segmentEnd == s.Length)
                {
                    break;
                }

                pos = IndexOfLastNormalizedChar(s, segmentEnd + 1, out startsWithSurrogatePair);

                // Copy the stretch between this segment and the next position to process
                // (or the end of the string when there is no further position) verbatim.
                var copyEnd    = pos == -1 ? s.Length : pos;
                var copyLength = copyEnd - segmentEnd;

                if (copyLength > 0)
                {
                    result.EnsureCapacity(copyLength);
                    s.CopyTo(segmentEnd, result.InnerArray, result.Count, copyLength);
                    result.Count += copyLength;
                }
            }
            while (pos != -1);

            return true;
        }
Exemple #4
0
        /// <summary>
        /// Converts a list of integer character values into a string. Values up to <c>char.MaxValue</c> are
        /// appended as a single <c>char</c>; larger values are split into a high/low surrogate pair.
        /// </summary>
        /// <param name="miniList">The values to convert.</param>
        /// <returns>The resulting UTF-16 string.</returns>
        static string ToString(MiniList <int> miniList)
        {
            var builder = new StringBuilder(miniList.Count);

            for (var index = 0; index < miniList.Count; index++)
            {
                var value = miniList[index];

                if (value > char.MaxValue)
                {
                    // Above the single-char range: emit high surrogate, then low surrogate.
                    var offset = value - 0x10000;
                    builder.Append((char)((offset / 0x400) + 0xD800));
                    builder.Append((char)((offset % 0x400) + 0xDC00));
                    continue;
                }

                builder.Append((char)value);
            }

            return builder.ToString();
        }
        /// <summary>
        /// Scans <paramref name="text"/> for URL-like spans and appends an <c>EntityInfo</c> of type
        /// <c>EntityType.Url</c> to <paramref name="result"/> for each one found.
        /// The scan is driven by '.' characters: for every dot it looks backwards for the start of the host
        /// (optionally preceded by an <c>http://</c> or <c>https://</c> scheme), forwards to the end of the host,
        /// validates the top-level domain against <paramref name="tldDic"/>, and then consumes an optional
        /// port, path and query.
        /// </summary>
        /// <param name="text">Text to scan.</param>
        /// <param name="urlWithoutProtocol">
        /// When <c>false</c>, candidates for which no http(s) scheme was detected are skipped.
        /// </param>
        /// <param name="tldDic">
        /// Lookup of known TLDs, keyed by the hash computed in this method over the lower-cased TLD characters.
        /// The <c>Length</c> and <c>Type</c> members of the value are read.
        /// </param>
        /// <param name="longestTldLength">Upper bound on the number of characters hashed as a TLD candidate.</param>
        /// <param name="shortestTldLength">Candidates shorter than this are not looked up.</param>
        /// <param name="result">List that receives the extracted URL entities.</param>
        public static void Extract(string text, bool urlWithoutProtocol, Dictionary <int, TldInfo> tldDic, int longestTldLength, int shortestTldLength, List <EntityInfo> result)
        {
            var dots       = new MiniList <int>();
            var hashCodes  = new MiniList <int>();
            var startIndex = 0;

Start:
            // Fewer than three characters remain: stop scanning.
            if (startIndex >= text.Length - 2)
            {
                return;
            }

            var dotIndex = text.IndexOf('.', startIndex);

            // No further dot, or the dot is the very last character: nothing more to find.
            if (dotIndex == -1 || dotIndex == text.Length - 1)
            {
                return;
            }
            if (dotIndex == startIndex)
            {
                // A '.' right at the start position cannot be part of a valid URL.
                goto GoToNextToDot;
            }

            // Position of dotIndex:
            // www.(←)twitter.com/
            // twitter.(←)com/

            // Give up on this dot if the character before or after it is '-' or '_'.
            var x = text[dotIndex - 1];

            if (x == '-' || x == '_')
            {
                goto GoToNextToDot;
            }
            x = text[dotIndex + 1];
            if (x == '-' || x == '_')
            {
                goto GoToNextToDot;
            }

            // Search backwards,
            // going back until a PrecedingChar is found.
            var precedingIndex       = -1;
            var lastUnicodeCharIndex = -1;
            var hasScheme            = false;

            for (var i = dotIndex - 1; i >= startIndex; i--)
            {
                var c = text[i];

                if (c == '/')
                {
                    // Give up if the host starts with '-' or '_'.
                    x = text[i + 1];
                    if (x == '-' || x == '_')
                    {
                        goto GoToNextToDot;
                    }

                    // Scheme detection: read backwards from this '/' expecting "//:" and then
                    // "ptth" or "sptth", i.e. "http://" or "https://" (letters compared via ToLower).
                    if (i >= 6)
                    {
                        var j = i - 1;
                        if (text[j--] == '/' && text[j--] == ':')
                        {
                            switch (ToLower(text[j--]))
                            {
                            case 's':
                                // "https://" needs one more character than "http://", hence i >= 7.
                                if (i >= 7 && ToLower(text[j--]) == 'p')
                                {
                                    goto case 'p';
                                }
                                break;

                            case 'p':
                                if (ToLower(text[j--]) == 't' && ToLower(text[j--]) == 't' && ToLower(text[j--]) == 'h')
                                {
                                    // The scheme must start the text or follow a PrecedingChar.
                                    if (j < 0 || IsPrecedingChar(text[j]))
                                    {
                                        precedingIndex = j;
                                        hasScheme      = true;
                                        goto BreakSchemeCheck;
                                    }
                                }
                                break;
                            }
                        }
                    }

                    // A '/' that is not part of a recognized scheme invalidates this candidate.
                    goto GoToNextToDot;
                }

                if (!IsValidDomainChar(c))
                {
                    if (IsPrecedingChar(c))
                    {
                        precedingIndex = i;
                        break;
                    }

                    // Invalid unless it is a PrecedingChar.
                    goto GoToNextToDot;
                }

                // Remember the first Unicode domain character met while walking backwards
                // (i.e. the one closest to the dot).
                if (lastUnicodeCharIndex == -1 && IsUnicodeDomainChar(c))
                {
                    lastUnicodeCharIndex = i;
                }
            }

            // From here on the candidate has no scheme.
            if (!urlWithoutProtocol)
            {
                goto GoToNextToDot;
            }

            if (lastUnicodeCharIndex != -1)
            {
                if (lastUnicodeCharIndex != dotIndex - 1 && IsPrecedingChar(text[lastUnicodeCharIndex]))
                {
                    // Rescue the candidate by excluding the Unicode characters from it.
                    precedingIndex       = lastUnicodeCharIndex;
                    lastUnicodeCharIndex = -1;
                }
                else
                {
                    goto GoToNextToDot;
                }
            }

            // Reject if no PrecedingChar was found (unless scanning started at the beginning of the text)
            // or if the host starts with '-' or '_'.
            x = text[precedingIndex + 1];
            if ((precedingIndex == -1 && startIndex != 0) || x == '-' || x == '_')
            {
                goto GoToNextToDot;
            }

BreakSchemeCheck:
            // Read the host part to its end.
            // dots collects, for each '.', the index of the character right after it.
            dots.Clear();
            dots.Add(dotIndex + 1);
            var hasUnicodeCharAfterDot = false;
            var nextIndex = text.Length;

            for (var i = dotIndex + 1; i < text.Length; i++)
            {
                var c = text[i];

                if (c == '.')
                {
                    // Stop if the '.' is the last character of text.
                    // Stop if there is no scheme but Unicode characters have been seen.
                    if (i == text.Length - 1 || (!hasScheme && hasUnicodeCharAfterDot))
                    {
                        nextIndex = i;
                        break;
                    }

                    // Stop if the character before or after the '.' is '-' or '_'.
                    x = text[i - 1];
                    if (x == '-' || x == '_')
                    {
                        nextIndex = i - 1;
                        break;
                    }
                    x = text[i + 1];
                    if (x == '-' || x == '_')
                    {
                        nextIndex = i;
                        break;
                    }

                    dots.Add(i + 1);
                    continue;
                }

                if (!IsValidDomainChar(c))
                {
                    nextIndex = i;
                    break;
                }

                if (!hasUnicodeCharAfterDot)
                {
                    hasUnicodeCharAfterDot = IsUnicodeDomainChar(c);
                }
            }

            // TLD validation: try the label after the last dot first, then earlier dots.
            TldInfo tldInfo;
            int     dotCount;

            for (var i = dots.Count - 1; i >= 0; i--)
            {
                var dotIndexPlusOne = dots[i];
                var len             = nextIndex - dotIndexPlusOne;
                if (len < shortestTldLength)
                {
                    continue;
                }
                if (len > longestTldLength)
                {
                    len = longestTldLength;
                }
                nextIndex = dotIndexPlusOne + len;

                // Compute the hash values on the spot to reduce the number of loop iterations.
                // hashCodes[k] ends up holding the hash of the first k + 1 characters of the candidate;
                // hash1 and hash2 are updated with alternating characters.
                hashCodes.Clear();
                var hash1 = 5381;
                var hash2 = hash1;

                for (var j = dotIndexPlusOne; j < nextIndex;)
                {
                    hash1 = ((hash1 << 5) + hash1) ^ ToLower(text[j++]);
                    hashCodes.Add(hash1 + hash2 * 1566083941);
                    if (j >= nextIndex)
                    {
                        break;
                    }
                    hash2 = ((hash2 << 5) + hash2) ^ ToLower(text[j++]);
                    hashCodes.Add(hash1 + hash2 * 1566083941);
                }

                // Try the longest prefix first; the prefix must end at the end of text or at a
                // position for which IsAlnumAt is false.
                for (var j = hashCodes.Count - 1; j >= 0; j--)
                {
                    nextIndex = dotIndexPlusOne + j + 1;
                    if ((nextIndex == text.Length || !IsAlnumAt(text[nextIndex])) &&
                        tldDic.TryGetValue(hashCodes[j], out tldInfo) &&
                        nextIndex - dotIndexPlusOne == tldInfo.Length)    // simple check against hash collisions
                    {
                        dotCount = i + 1;
                        goto TldDecided;
                    }
                }
            }

            goto GoToNextToDot;

TldDecided:
            // A ccTLD without a subdomain requires a scheme (unless a '/' follows the host).
            if (!hasScheme && tldInfo.Type == TldType.CcTld &&
                (dotCount == 1 && (nextIndex >= text.Length || text[nextIndex] != '/')))
            {
                goto GoToNextIndex;
            }

            // '_' is allowed in subdomains but not in the domain itself.
            for (var i = dots.Last - 2; i > precedingIndex; i--)
            {
                var c = text[i];
                if (c == '.' || c == '/')
                {
                    break;
                }
                if (c == '_')
                {
                    goto GoToNextIndex;
                }
            }

            var urlStartIndex = precedingIndex + 1;

            if (nextIndex >= text.Length)
            {
                goto AddAndGoNext;
            }

            // Port number
            // NOTE(review): when ':' is the last character of text, nextIndex has already been advanced
            // past it by the ++ below, so the entity added at AddAndGoNext appears to include the
            // trailing ':' (unlike the portNumberLength == 0 case, which excludes it) — confirm intent.
            if (text[nextIndex] == ':' && ++nextIndex < text.Length)
            {
                var portNumberLength = 0;
                for (; nextIndex < text.Length; nextIndex++)
                {
                    var c = text[nextIndex];
                    if (c <= '9' && c >= '0')
                    {
                        portNumberLength++;
                    }
                    else
                    {
                        break;
                    }
                }

                if (portNumberLength == 0)
                {
                    // ':' without digits: step back so the URL ends before the ':'.
                    result.Add(new EntityInfo(urlStartIndex, (--nextIndex) - urlStartIndex, EntityType.Url));
                    goto GoToNextIndex;
                }
            }

            if (nextIndex >= text.Length)
            {
                goto AddAndGoNext;
            }

            // Path
            if (text[nextIndex] == '/')
            {
                // Special-case only https?://t.co/xxxxxxxxxx
                // (len is 11 for "http://t.co" and 12 for "https://t.co").
                var len = nextIndex - urlStartIndex;
                nextIndex++;
                if (hasScheme && (len == 11 || len == 12) &&
                    ToLower(text[nextIndex - 2]) == 'o' && ToLower(text[nextIndex - 3]) == 'c' &&
                    text[nextIndex - 4] == '.' && ToLower(text[nextIndex - 5]) == 't' && text[nextIndex - 6] == '/' &&
                    nextIndex < text.Length && IsAlnum(text[nextIndex]))
                {
                    // For t.co the path consists only of characters accepted by IsAlnum.
                    nextIndex++;
                    for (; nextIndex < text.Length; nextIndex++)
                    {
                        if (!IsAlnum(text[nextIndex]))
                        {
                            break;
                        }
                    }
                    goto AddAndGoNext;
                }

                nextIndex += EatPath(text, nextIndex);
            }

            if (nextIndex >= text.Length)
            {
                goto AddAndGoNext;
            }

            // Query
            if (text[nextIndex] == '?')
            {
                nextIndex++;
                nextIndex += EatQuery(text, nextIndex);
            }

AddAndGoNext:
            result.Add(new EntityInfo(urlStartIndex, nextIndex - urlStartIndex, EntityType.Url));

GoToNextIndex:
            // Resume scanning right after the span just examined.
            startIndex = nextIndex;
            goto Start;

GoToNextToDot:
            // Resume scanning right after the current dot.
            startIndex = dotIndex + 1;
            goto Start;
        }
Exemple #6
0
 /// <summary>
 /// Creates a string from the first <c>Count</c> characters of the list's backing array.
 /// </summary>
 /// <param name="miniList">The character list to convert.</param>
 /// <returns>A new string containing the list's characters.</returns>
 static string ToString(MiniList <char> miniList)
 {
     var buffer = miniList.InnerArray;
     var length = miniList.Count;

     return new string(buffer, 0, length);
 }
 /// <summary>
 /// Stores the dot matrix and allocates the working collections: a <c>MiniList</c> constructed with
 /// <paramref name="width"/> * <paramref name="height"/> (presumably its initial capacity — confirm against
 /// <c>MiniList</c>) and one flag per column.
 /// </summary>
 /// <param name="matrix">The two-dimensional array of dot controllers to keep a reference to.</param>
 /// <param name="width">Number of columns; also the length of the per-column flag array.</param>
 /// <param name="height">Number of rows.</param>
 public Matches(DotController[,] matrix, int width, int height)
 {
     this.matrix = matrix;

     var cellCount = width * height;
     matches  = new MiniList <DotController>(cellCount);
     dirtyCol = new bool[width];
 }
Exemple #8
0
        /// <summary>
        /// Composes, in place, the characters of <paramref name="list"/> from <paramref name="startIndex"/> to the
        /// end of the list, then sets <c>list.Count</c> to the end of the composed output.
        /// The character at <paramref name="startIndex"/> is taken as the initial starter; each following
        /// character is either merged into the current starter (Hangul arithmetic or a hit in the composition
        /// table) or written back at the insertion position.
        /// </summary>
        /// <param name="list">Character buffer to compose; modified in place.</param>
        /// <param name="startIndex">Index of the first character of the range to compose.</param>
        private static void ComposeInRange(ref MiniList <char> list, int startIndex)
        {
            bool isLastSurrogatePair;
            uint last                   = ToUtf16Int(list.InnerArray, startIndex, out isLastSurrogatePair);
            var  starterIndex           = startIndex;
            // The starter occupies the upper 32 bits of the composition-table key.
            var  starter                = ((ulong)last) << 32;
            var  isStarterSurrogatePair = isLastSurrogatePair;
            var  i           = startIndex + (isLastSurrogatePair ? 2 : 1);  // read position
            var  insertIndex = i;                                           // write position
            var  lastCcc     = 0;

            while (i < list.Count)
            {
                var hi = list[i];
                var isSurrogatePair = IsHighSurrogate(hi) &&
                                      i + 1 < list.Count && char.IsLowSurrogate(list[i + 1]);
                uint c;
                if (isSurrogatePair)
                {
                    // Presumably packed as (high << 16) | low, matching how `composed` is unpacked below.
                    c  = ToUtf16Int(hi, list[i + 1]);
                    i += 2;
                }
                else
                {
                    c = hi;
                    i++;
                }

                // Hangul
                if (!isLastSurrogatePair && !isSurrogatePair) // NOTE(original author): is this condition correct??
                {
                    // Leading consonant + vowel -> LV syllable.
                    var LIndex = last - LBase;
                    if (LIndex >= 0 && LIndex < LCount)
                    {
                        var VIndex = c - VBase;
                        if (VIndex >= 0 && VIndex < VCount)
                        {
                            last = SBase + (LIndex * VCount + VIndex) * TCount;
                            list[insertIndex - 1] = (char)last;
                            lastCcc = 0;
                            continue;
                        }
                    }

                    // LV syllable + trailing consonant -> LVT syllable.
                    var SIndex = last - SBase;
                    if (SIndex >= 0 && SIndex < SCount && (SIndex % TCount) == 0)
                    {
                        var TIndex = c - TBase;
                        if (0 < TIndex && TIndex < TCount)
                        {
                            last += TIndex;
                            list[insertIndex - 1] = (char)last;
                            lastCcc = 0;
                            continue;
                        }
                    }
                }
                // End of Hangul handling

                var ccc = GetCanonicalCombiningClass(c);
                if (ccc != 0 && lastCcc == ccc)
                {
                    // Blocked: same non-zero combining class as the previous character, so copy it through.
                    list[insertIndex++] = hi;
                    if (isSurrogatePair)
                    {
                        // Truncating to char keeps the low 16 bits, i.e. the low surrogate.
                        list[insertIndex++] = (char)c;
                    }
                    last = c;
                    isLastSurrogatePair = isSurrogatePair;
                    continue;
                }

                var  key = starter | c;
                uint composed;
                if ((ccc != 0 || (ccc == 0 && lastCcc == 0)) && LookupCompositionTable(key, out composed))
                {
                    if (composed <= char.MaxValue)
                    {
                        if (isStarterSurrogatePair)
                        {
                            // The starter shrinks from two chars to one: close the gap left by its low
                            // surrogate by shifting the already-written characters one slot to the left.
                            // The insertion position moves back exactly once; the previous code decremented
                            // it on every loop-condition check, which cut the shift short and dropped
                            // characters whenever anything had been written after the starter.
                            Debug.Assert(insertIndex < i);
                            insertIndex--;
                            for (var j = starterIndex + 1; j < insertIndex; j++)
                            {
                                list[j] = list[j + 1];
                            }
                        }

                        list[starterIndex]     = (char)composed;
                        isStarterSurrogatePair = false;
                    }
                    else
                    {
                        if (!isStarterSurrogatePair)
                        {
                            // The starter grows from one char to two: make room for its low surrogate
                            // by shifting the already-written characters one slot to the right.
                            Debug.Assert(insertIndex < i);
                            var starterLoIndex = starterIndex + 1;
                            for (var j = insertIndex; j > starterLoIndex; j--)
                            {
                                list[j] = list[j - 1];
                            }
                            insertIndex++;
                        }

                        list[starterIndex]     = (char)(composed >> 16);
                        list[starterIndex + 1] = (char)(composed & char.MaxValue);
                        isStarterSurrogatePair = true;
                    }

                    starter = ((ulong)composed) << 32;
                    ccc     = 0; // NOTE(original author): is this right??
                }
                else
                {
                    if (ccc == 0)
                    {
                        // A character with combining class 0 that did not compose becomes the new starter.
                        starterIndex           = insertIndex;
                        starter                = ((ulong)c) << 32;
                        isStarterSurrogatePair = isSurrogatePair;
                    }
                    list[insertIndex++] = hi;
                    if (isSurrogatePair)
                    {
                        list[insertIndex++] = (char)c;
                    }
                }

                last = c;
                isLastSurrogatePair = isSurrogatePair;
                lastCcc             = ccc;
            }

            // Everything at or beyond insertIndex is leftover input that has been consumed.
            list.Count = insertIndex;
        }
Exemple #9
0
        /// <summary>
        /// Appends the decomposition of <paramref name="code"/> to <paramref name="result"/>.
        /// If the decomposition table has an entry for the value, its one or two components are decomposed
        /// recursively; otherwise the value itself is inserted, placed before any trailing characters whose
        /// canonical combining class is greater than its own.
        /// </summary>
        /// <param name="code">
        /// Value to decompose. Values above <c>char.MaxValue</c> are written as two chars: the upper 16 bits
        /// followed by the lower 16 bits.
        /// </param>
        /// <param name="result">Output buffer; its backing array is replaced when it is too small.</param>
        private static void DecompCore(uint code, ref MiniList <char> result)
        {
            // Hangul is not decomposed because it would be composed again anyway.

            // Hard-coded for Unicode 8.0:
            // short-cut that skips the table lookup for large gaps and for Hangul.
            // The lookup is performed only for 0x00C0..0x1026, 0x1B06..0x30FE and 0xF900 upwards.
            if (!(code < 0x00C0 || (code > 0x1026 && (code < 0x1B06 || (code > 0x30FE && (code < 0xF900))))))
            {
                var i = LookupDecompositionTable(code);
                if (i != -1)
                {
                    var first = DecompositionTableEntries[i];
                    DecompCore(first, ref result);

                    // A second entry of 0 means the decomposition has a single component.
                    var second = DecompositionTableEntries[i + 1];
                    if (second != 0)
                    {
                        DecompCore(second, ref result);
                    }

                    return;
                }
            }

            var insertIndex     = result.Count;
            var isSurrogatePair = code > char.MaxValue;

            if (insertIndex > 0)
            {
                var ccc = GetCanonicalCombiningClass(code);
                if (ccc != 0)
                {
                    // Walk backwards over characters whose combining class is greater than ours;
                    // the new character is inserted in front of them.
                    var j = insertIndex - 1;
                    while (true)
                    {
                        uint prev = result[j];
                        var  isPrevSurrogatePair = IsLowSurrogate(prev) && j > 0 && IsHighSurrogate(result[j - 1]);
                        var  prevCcc             = GetCanonicalCombiningClass(isPrevSurrogatePair ? ToUtf16Int(result[--j], prev) : prev);
                        if (prevCcc <= ccc)
                        {
                            break;
                        }
                        insertIndex = j;
                        if (j == 0)
                        {
                            insertIndex = 0;
                            break;
                        }
                        j--;
                    }
                }

                if (result.InnerArray.Length < result.Count + 2)
                {
                    // Grow by doubling, but never to less than Count + 2: with Count == 1 doubling
                    // yields only 2 slots, which cannot hold the existing char plus a surrogate pair.
                    var newLength = result.Count * 2;
                    if (newLength < result.Count + 2)
                    {
                        newLength = result.Count + 2;
                    }

                    var newArray = new char[newLength];
                    if (insertIndex < result.Count)
                    {
                        // Copy the head, then the tail shifted right to leave a gap for the new character.
                        Array.Copy(result.InnerArray, newArray, insertIndex);
                        Array.Copy(result.InnerArray, insertIndex, newArray, insertIndex + (isSurrogatePair ? 2 : 1), result.Count - insertIndex);
                    }
                    else
                    {
                        Array.Copy(result.InnerArray, newArray, result.Count);
                    }
                    result.InnerArray = newArray;
                }
                else
                {
                    if (insertIndex < result.Count)
                    {
                        // Enough room already: shift the tail right within the same array.
                        Array.Copy(result.InnerArray, insertIndex, result.InnerArray, insertIndex + (isSurrogatePair ? 2 : 1), result.Count - insertIndex);
                    }
                }
            }
            else
            {
                result.EnsureCapacity(2);
            }

            if (isSurrogatePair)
            {
                result.InnerArray[insertIndex]     = (char)(code >> 16);
                result.InnerArray[insertIndex + 1] = (char)code;
                result.Count += 2;
            }
            else
            {
                result.InnerArray[insertIndex] = (char)code;
                result.Count++;
            }
        }
 /// <summary>
 /// Creates an enumerator over <paramref name="miniList"/>, with the cursor <c>x</c> initialised to -1.
 /// </summary>
 /// <param name="miniList">The list to enumerate.</param>
 internal Enumerator(MiniList <T> miniList)
 {
     x = -1;
     this.miniList = miniList;
 }
 /// <summary>
 /// Drops the reference to the list held in <c>miniList</c>.
 /// </summary>
 public void Dispose()
 {
     this.miniList = null;
 }