/// <summary>
            /// Segments <paramref name="text"/> into words and fills <c>result</c> with one entry per distinct word.
            /// The text is first copied into a freshly allocated <c>formatText</c> buffer (one extra trailing slot is
            /// reserved and used as a scan sentinel), then scanned in up to two passes: a trie-graph pass (only when a
            /// non-default character type table is in use) followed by a character-class pass.
            /// </summary>
            /// <param name="text">Text to segment. Presumably must be non-empty (the scans are do/while loops and the
            /// formatter is named FormatNotEmpty) - TODO confirm with callers.</param>
            protected void getResult(string text)
            {
                result.Clear();
                // Allocate length + 1 characters: the extra slot at index foramtLength is the writable sentinel (*end).
                formatText = AutoCSer.Extension.StringExtension.FastAllocateString((foramtLength = text.Length) + 1);
                fixed(char *textFixed = formatText)
                {
                    // Writes the formatted form of text into the pinned buffer (exact normalization is defined by
                    // Simplified.FormatNotEmpty, not visible here).
                    Simplified.FormatNotEmpty(text, textFixed, foramtLength);
                    words.Length = matchs.Length = 0;
                    char *start = textFixed, end = textFixed + foramtLength;
                    byte  type, nextType, wordType;
                    // Ensures checkMatchMap() is called at most once per invocation.
                    bool  isMatchMap = false;

                    // Pass 1: trie-graph matching. Skipped when the default character type table is in use.
                    if (charTypeData != StringTrieGraph.DefaultCharTypeData.Byte)
                    {
                        StaticStringTrieGraph trieGraph = searcher.trieGraph;
                        int  count, index, startIndex;
                        char trieGraphHeadChar = trieGraph.AnyHeadChar;
                        do
                        {
                            if (((type = charTypeData[*start]) & StringTrieGraph.TrieGraphHeadFlag) == 0)
                            {
                                // Current character is not flagged as a trie-graph head: plant a head character in the
                                // sentinel slot so the skip loop below always stops without a bounds check.
                                *end = trieGraphHeadChar;
                                do
                                {
                                    // Characters flagged both Chinese and TrieGraph that are skipped here are emitted
                                    // as single-character Chinese words.
                                    if ((type & ((byte)WordType.Chinese | (byte)WordType.TrieGraph)) == ((byte)WordType.Chinese | (byte)WordType.TrieGraph))
                                    {
                                        addWord((int)(start - textFixed), 1, WordType.Chinese);
                                    }
                                    if (((nextType = charTypeData[*++start]) & StringTrieGraph.TrieGraphHeadFlag) != 0)
                                    {
                                        if (start == end)
                                        {
                                            // Stopped on the sentinel: no more head characters in the text.
                                            goto TRIEGRAPHEND;
                                        }
                                        // Accept the head when it is Chinese, or when it does not share a
                                        // letter/number/keep class with the previous character (i.e. it is not in the
                                        // middle of a run of the same class).
                                        if ((nextType & (byte)WordType.Chinese) != 0 ||
                                            (type & nextType & ((byte)WordType.OtherLetter | (byte)WordType.Letter | (byte)WordType.Number | (byte)WordType.Keep)) == 0)
                                        {
                                            goto TRIEGRAPH;
                                        }
                                    }
                                    type = nextType;
                                }while (true);
                            }
TRIEGRAPH:
                            // Switch the sentinel to a space for the TrieGraph-flag scan below (presumably a space
                            // carries no TrieGraph flag, which is what terminates that scan - TODO confirm).
                            *end = ' ';
                            // segment = first character of the candidate; segmentEnd = end of the longest prefix seen so
                            // far that finishes on an end-flagged character. start is advanced by one in both branches.
                            // NOTE(review): when reached through "goto TRIEGRAPH", type still holds the class of the
                            // character BEFORE start (the "type = nextType" assignment was skipped) - confirm intended.
                            char *segment = start, segmentEnd = (type & StringTrieGraph.TrieGraphEndFlag) == 0 ? start++ : ++start;
                            while (((type = charTypeData[*start]) & (byte)WordType.TrieGraph) != 0)
                            {
                                ++start;
                                if ((type & StringTrieGraph.TrieGraphEndFlag) != 0)
                                {
                                    segmentEnd = start;
                                }
                            }
                            if ((int)(start - segment) == 1)
                            {
                                // Single-character candidate. At this point type is the class of the character at
                                // start (the one after the segment), as assigned by the loop condition above, and its
                                // TrieGraph bit is necessarily clear, so the ternary below always yields Chinese.
                                if ((type & (byte)WordType.Chinese) != 0)
                                {
                                    addWord((int)(segment - textFixed), 1, (type & (byte)WordType.TrieGraph) != 0 ? WordType.TrieGraph : WordType.Chinese);
                                }
                            }
                            else
                            {
                                if (segment != segmentEnd)
                                {
                                    matchs.Length = 0;
                                    // Collect dictionary matches inside [segment, segmentEnd) as (offset, length) pairs.
                                    trieGraph.LeftRightMatchs(segment, segmentEnd, ref matchs);
                                    if ((count = matchs.Length) == 0)
                                    {
                                        // Nothing matched: fall back to per-character handling of the whole candidate.
                                        segmentEnd = segment;
                                        goto CHINESE;
                                    }
                                    if (!isMatchMap)
                                    {
                                        checkMatchMap();
                                        isMatchMap = true;
                                    }
                                    startIndex = (int)(segment - textFixed);
                                    // matchs.Array is the backing array; count limits iteration to the first
                                    // matchs.Length entries.
                                    foreach (KeyValue <int, int> value in matchs.Array)
                                    {
                                        // value.Key is relative to segment; convert to an absolute text index.
                                        addWord(index = value.Key + startIndex, value.Value, WordType.TrieGraph);
                                        matchMap.Set(index, value.Value);
                                        if (--count == 0)
                                        {
                                            break;
                                        }
                                    }
                                    index = (int)(segmentEnd - textFixed);
                                    // Emit Chinese characters in the matched range for which matchMap reports 0
                                    // (presumably "not covered by any match" - TODO confirm matchMap semantics).
                                    do
                                    {
                                        if (matchMap.Get(startIndex) == 0 && (charTypeData[textFixed[startIndex]] & (byte)WordType.Chinese) != 0)
                                        {
                                            addWord(startIndex, 1, WordType.Chinese);
                                        }
                                    }while (++startIndex != index);
                                }
CHINESE:
                                // Remaining tail [segmentEnd, start): emit each Chinese character as its own word.
                                while (segmentEnd != start)
                                {
                                    if ((charTypeData[*segmentEnd] & (byte)WordType.Chinese) != 0)
                                    {
                                        addWord((int)(segmentEnd - textFixed), 1, WordType.Chinese);
                                    }
                                    ++segmentEnd;
                                }
                            }
                        }while (start != end);
TRIEGRAPHEND:
                        // Rewind for the second pass.
                        start = textFixed;
                    }
                    // Pass 2: character-class segmentation (Chinese / letters / numbers / keep / other letters).
                    do
                    {
                        type = charTypeData[*start];
                        if ((type &= ((byte)WordType.Chinese | (byte)WordType.OtherLetter | (byte)WordType.Letter | (byte)WordType.Number | (byte)WordType.Keep)) == 0)
                        {
                            // Not a word character: plant '0' in the sentinel slot (presumably classified as a word
                            // character so the skip loop stops - TODO confirm) and skip to the next word character.
                            *end = '0';
                            do
                            {
                                type = charTypeData[*++start];
                                if ((type &= ((byte)WordType.Chinese | (byte)WordType.OtherLetter | (byte)WordType.Letter | (byte)WordType.Number | (byte)WordType.Keep)) != 0)
                                {
                                    if (start == end)
                                    {
                                        // Stopped on the sentinel: the text is exhausted.
                                        goto END;
                                    }
                                    goto OTHER;
                                }
                            }while (true);
                        }
OTHER:
                        // Space sentinel terminates the class-run scans below (assumes a space has none of the scanned
                        // class bits - TODO confirm).
                        *end = ' ';
                        if ((type & (byte)WordType.Chinese) != 0)
                        {
                            // Run of Chinese characters: emit each one that is not TrieGraph-flagged as a single word.
                            // NOTE(review): for the first character of the run, type was masked to the word-class bits
                            // above; if TrieGraph is not one of those bits this test is always true for it - confirm.
                            do
                            {
                                if ((type & (byte)WordType.TrieGraph) == 0)
                                {
                                    addWord((int)(start - textFixed), 1, WordType.Chinese);
                                }
                            }while (((type = charTypeData[*++start]) & (byte)WordType.Chinese) != 0);
                        }
                        else
                        {
                            char *segment = start;
                            if ((type & (byte)WordType.OtherLetter) == 0)
                            {
                                // Run of Letter/Number/Keep characters. word marks the start of the current
                                // same-class sub-run; wordType accumulates every class seen in the whole run.
                                char *word = start;
                                wordType = type;
                                for (nextType = charTypeData[*++start]; (nextType &= ((byte)WordType.Letter | (byte)WordType.Number | (byte)WordType.Keep)) != 0; nextType = charTypeData[*++start])
                                {
                                    if (type != nextType)
                                    {
                                        // Class changed: emit the finished sub-run unless it is of the Keep class.
                                        if (type != (byte)WordType.Keep)
                                        {
                                            addWord((int)(word - textFixed), (int)(start - word), (WordType)type);
                                        }
                                        wordType |= nextType;
                                        type      = nextType;
                                        word      = start;
                                    }
                                }
                                // Emit the trailing sub-run only if the run was actually split (otherwise it equals
                                // the whole run emitted next).
                                if (word != segment && type != (byte)WordType.Keep)
                                {
                                    addWord((int)(word - textFixed), (int)(start - word), (WordType)type);
                                }
                                // Always emit the whole run with the combined class bits.
                                addWord((int)(segment - textFixed), (int)(start - segment), (WordType)wordType);
                            }
                            else
                            {
                                // Run of OtherLetter characters: consume it and emit it as one word.
                                while ((charTypeData[*++start] & (byte)WordType.OtherLetter) != 0)
                                {
                                    ;
                                }
                                addWord((int)(segment - textFixed), (int)(start - segment), WordType.OtherLetter);
                            }
                        }
                    }while (start != end);
END:
                    // Convert the collected words into result entries.
                    if (words.Length != 0)
                    {
                        int count = words.Length, textLength = text.Length;
                        if ((searcher.flags & SearchFlags.ResultIndexs) == 0)
                        {
                            // Positions are not requested: store only word type and text length per word
                            // (a repeated word simply overwrites its entry).
                            foreach (KeyValue <SubString, WordType> word in words.Array)
                            {
                                result[word.Key] = new ResultIndexLeftArray {
                                    WordType = word.Value, TextLength = textLength
                                };
                                if (--count == 0)
                                {
                                    break;
                                }
                            }
                        }
                        else
                        {
                            // Positions are requested: accumulate the start index of every occurrence per word.
                            ResultIndexLeftArray indexs;
                            foreach (KeyValue <SubString, WordType> word in words.Array)
                            {
                                HashString wordKey = word.Key;
                                if (result.TryGetValue(wordKey, out indexs))
                                {
                                    indexs.Indexs.Add(word.Key.Start);
                                    // Written back after modification (presumably because ResultIndexLeftArray is a
                                    // value type - TODO confirm).
                                    result[wordKey] = indexs;
                                }
                                else
                                {
                                    indexs.Set(textLength, word.Value);
                                    // Reuse a pooled index array when one is available.
                                    if (indexArrays.Length != 0)
                                    {
                                        indexs.Indexs.Set(indexArrays.UnsafePopOnly(), 0);
                                    }
                                    indexs.Indexs.Add(word.Key.Start);
                                    result.Add(wordKey, indexs);
                                }
                                if (--count == 0)
                                {
                                    break;
                                }
                            }
                            // Positions were appended across two passes, so sort each list.
                            foreach (ResultIndexLeftArray indexArray in result.Values)
                            {
                                indexArray.Indexs.sort();
                            }
                        }
                    }
                }
            }
Esempio n. 2
0
            /// <summary>
            /// Segments <paramref name="text"/> and collects every resulting word (as a <c>SubString</c> over the
            /// freshly allocated <c>formatText</c> buffer) into <c>removeResult</c>. The scan mirrors
            /// <c>getResult</c>: a trie-graph pass (only when a non-default character type table is in use)
            /// followed by a character-class pass, so the collected words correspond to those produced when indexing.
            /// </summary>
            /// <param name="text">Text to segment. Presumably must be non-empty (the scans are do/while loops and the
            /// formatter is named FormatNotEmpty) - TODO confirm with callers.</param>
            private void getRemoveResult(string text)
            {
                removeResult.Clear();
                formatLength = text.Length;
                // Allocate length + 1 characters: the extra slot at index formatLength is the writable sentinel (*end).
                formatText   = AutoCSer.Extension.StringExtension.FastAllocateString(formatLength + 1);
                fixed(char *textFixed = formatText)
                {
                    // Writes the formatted form of text into the pinned buffer (exact normalization is defined by
                    // Simplified.FormatNotEmpty, not visible here).
                    Simplified.FormatNotEmpty(text, textFixed, formatLength);
                    matchs.Length = 0;
                    char *start = textFixed, end = textFixed + formatLength;

                    // Pass 1: trie-graph matching. Skipped when the default character type table is in use.
                    if (charTypeData != StringTrieGraph.DefaultCharTypeData.Byte)
                    {
                        StaticStringTrieGraph trieGraph = searcher.trieGraph;
                        int  count, index, startIndex;
                        char trieGraphHeadChar = trieGraph.AnyHeadChar;
                        byte type, nextType;
                        // Ensures checkMatchMap() is called at most once per invocation.
                        bool isMatchMap = false;
                        do
                        {
                            if (((type = charTypeData[*start]) & StringTrieGraph.TrieGraphHeadFlag) == 0)
                            {
                                // Current character is not flagged as a trie-graph head: plant a head character in the
                                // sentinel slot so the skip loop below always stops without a bounds check.
                                *end = trieGraphHeadChar;
                                do
                                {
                                    // Skipped characters flagged both Chinese and TrieGraph are single-character words.
                                    if ((type & ((byte)WordType.Chinese | (byte)WordType.TrieGraph)) == ((byte)WordType.Chinese | (byte)WordType.TrieGraph))
                                    {
                                        removeResult.Add(new SubString((int)(start - textFixed), 1, formatText));
                                    }
                                    if (((nextType = charTypeData[*++start]) & StringTrieGraph.TrieGraphHeadFlag) != 0)
                                    {
                                        if (start == end)
                                        {
                                            // Stopped on the sentinel: no more head characters in the text.
                                            goto TRIEGRAPHEND;
                                        }
                                        // Accept the head when it is Chinese, or when it does not share a
                                        // letter/number/keep class with the previous character.
                                        if ((nextType & (byte)WordType.Chinese) != 0 ||
                                            (type & nextType & ((byte)WordType.OtherLetter | (byte)WordType.Letter | (byte)WordType.Number | (byte)WordType.Keep)) == 0)
                                        {
                                            goto TRIEGRAPH;
                                        }
                                    }
                                    type = nextType;
                                }while (true);
                            }
TRIEGRAPH:
                            // Switch the sentinel to a space for the TrieGraph-flag scan below (presumably a space
                            // carries no TrieGraph flag, which is what terminates that scan - TODO confirm).
                            *end = ' ';
                            // segment = first character of the candidate; segmentEnd = end of the longest prefix seen so
                            // far that finishes on an end-flagged character. start is advanced by one in both branches.
                            // NOTE(review): when reached through "goto TRIEGRAPH", type still holds the class of the
                            // character BEFORE start (the "type = nextType" assignment was skipped) - confirm intended.
                            char *segment = start, segmentEnd = (type & StringTrieGraph.TrieGraphEndFlag) == 0 ? start++ : ++start;
                            while (((type = charTypeData[*start]) & (byte)WordType.TrieGraph) != 0)
                            {
                                ++start;
                                if ((type & StringTrieGraph.TrieGraphEndFlag) != 0)
                                {
                                    segmentEnd = start;
                                }
                            }
                            if ((int)(start - segment) == 1)
                            {
                                // Single-character candidate. type here is the class of the character at start (the
                                // one after the segment), as assigned by the loop condition above.
                                if ((type & (byte)WordType.Chinese) != 0)
                                {
                                    removeResult.Add(new SubString((int)(segment - textFixed), 1, formatText));
                                }
                            }
                            else
                            {
                                if (segment != segmentEnd)
                                {
                                    matchs.Length = 0;
                                    // Collect dictionary matches inside [segment, segmentEnd) as (offset, length) pairs.
                                    trieGraph.LeftRightMatchs(segment, segmentEnd, ref matchs);
                                    if ((count = matchs.Length) == 0)
                                    {
                                        // Nothing matched: fall back to per-character handling of the whole candidate.
                                        segmentEnd = segment;
                                        goto CHINESE;
                                    }
                                    if (!isMatchMap)
                                    {
                                        checkMatchMap();
                                        isMatchMap = true;
                                    }
                                    startIndex = (int)(segment - textFixed);
                                    // matchs.Array is the backing array; count limits iteration to the first
                                    // matchs.Length entries.
                                    foreach (KeyValue <int, int> value in matchs.Array)
                                    {
                                        // value.Key is relative to segment; convert to an absolute text index.
                                        removeResult.Add(new SubString(index = value.Key + startIndex, value.Value, formatText));
                                        matchMap.Set(index, value.Value);
                                        if (--count == 0)
                                        {
                                            break;
                                        }
                                    }
                                    index = (int)(segmentEnd - textFixed);
                                    // Collect Chinese characters in the matched range for which matchMap reports 0
                                    // (presumably "not covered by any match" - TODO confirm matchMap semantics).
                                    do
                                    {
                                        if (matchMap.Get(startIndex) == 0 && (charTypeData[textFixed[startIndex]] & (byte)WordType.Chinese) != 0)
                                        {
                                            removeResult.Add(new SubString(startIndex, 1, formatText));
                                        }
                                    }while (++startIndex != index);
                                }
CHINESE:
                                // Remaining tail [segmentEnd, start): collect each Chinese character as its own word.
                                while (segmentEnd != start)
                                {
                                    if ((charTypeData[*segmentEnd] & (byte)WordType.Chinese) != 0)
                                    {
                                        removeResult.Add(new SubString((int)(segmentEnd - textFixed), 1, formatText));
                                    }
                                    ++segmentEnd;
                                }
                            }
                        }while (start != end);
TRIEGRAPHEND:
                        // Rewind for the second pass.
                        start = textFixed;
                    }
                    // Pass 2: character-class segmentation (Chinese / letters / numbers / keep / other letters).
                    do
                    {
                        byte type = charTypeData[*start];
                        if ((type &= ((byte)WordType.Chinese | (byte)WordType.OtherLetter | (byte)WordType.Letter | (byte)WordType.Number | (byte)WordType.Keep)) == 0)
                        {
                            // Not a word character: plant '0' in the sentinel slot (presumably classified as a word
                            // character so the skip loop stops - TODO confirm) and skip to the next word character.
                            *end = '0';
                            do
                            {
                                type = charTypeData[*++start];
                                if ((type &= ((byte)WordType.Chinese | (byte)WordType.OtherLetter | (byte)WordType.Letter | (byte)WordType.Number | (byte)WordType.Keep)) != 0)
                                {
                                    if (start == end)
                                    {
                                        // Stopped on the sentinel: the text is exhausted.
                                        return;
                                    }
                                    goto OTHER;
                                }
                            }while (true);
                        }
OTHER:
                        // Space sentinel terminates the class-run scans below (assumes a space has none of the scanned
                        // class bits - TODO confirm).
                        *end = ' ';
                        if ((type & (byte)WordType.Chinese) != 0)
                        {
                            // Run of Chinese characters: collect each one that is not TrieGraph-flagged.
                            do
                            {
                                if ((type & (byte)WordType.TrieGraph) == 0)
                                {
                                    removeResult.Add(new SubString((int)(start - textFixed), 1, formatText));
                                }
                            }while (((type = charTypeData[*++start]) & (byte)WordType.Chinese) != 0);
                        }
                        else
                        {
                            char *segment = start;
                            if ((type & (byte)WordType.OtherLetter) == 0)
                            {
                                // Run of Letter/Number/Keep characters. word marks the start of the current
                                // same-class sub-run.
                                char *word = start;
                                for (byte newType = charTypeData[*++start]; (newType &= ((byte)WordType.Letter | (byte)WordType.Number | (byte)WordType.Keep)) != 0; newType = charTypeData[*++start])
                                {
                                    if (type != newType)
                                    {
                                        // Class changed: collect the finished sub-run unless it is of the Keep class.
                                        if (type != (byte)WordType.Keep)
                                        {
                                            removeResult.Add(new SubString((int)(word - textFixed), (int)(start - word), formatText));
                                        }
                                        type = newType;
                                        word = start;
                                    }
                                }
                                // Fix: also collect the trailing sub-run when the run was split. getResult emits this
                                // word when indexing, so it must be collected here too; previously only the leading
                                // sub-runs and the whole run were collected (e.g. "123" of "abc123" was missed).
                                if (word != segment && type != (byte)WordType.Keep)
                                {
                                    removeResult.Add(new SubString((int)(word - textFixed), (int)(start - word), formatText));
                                }
                            }
                            else
                            {
                                // Run of OtherLetter characters: just consume it; it is collected as a whole below.
                                while ((charTypeData[*++start] & (byte)WordType.OtherLetter) != 0)
                                {
                                    ;
                                }
                            }
                            // Always collect the whole run.
                            removeResult.Add(new SubString((int)(segment - textFixed), (int)(start - segment), formatText));
                        }
                    }while (start != end);
                }
            }
Esempio n. 3
0
            /// <summary>
            /// Segments the already formatted text at <paramref name="textFixed"/> (length <c>formatLength</c>) and
            /// submits every word to <c>checkAddWord</c>. Uses the same two-pass scan as the indexing code: a
            /// trie-graph pass (only when a non-default character type table is in use) followed by a
            /// character-class pass. The slot at <c>textFixed[formatLength]</c> is overwritten as a scan sentinel and
            /// is reset to a space before returning.
            /// </summary>
            /// <param name="textFixed">Pointer to the formatted text; must have one writable character after the
            /// <c>formatLength</c> characters of text.</param>
            /// <param name="isAllMatch">Whether every keyword is required to match: when true, the method returns
            /// false as soon as <c>checkAddWord</c> rejects a word.</param>
            /// <returns>false if <paramref name="isAllMatch"/> is set and some word was rejected by
            /// <c>checkAddWord</c>; otherwise true.</returns>
            private bool get(char *textFixed, bool isAllMatch)
            {
                char *start = textFixed, end = textFixed + formatLength;

                try
                {
                    matchs.Length = 0;
                    byte type, nextType;
                    // Ensures checkMatchMap() is called at most once per invocation.
                    bool isMatchMap = false;
                    // Pass 1: trie-graph matching. Skipped when the default character type table is in use.
                    if (charTypeData != StringTrieGraph.DefaultCharTypeData.Byte)
                    {
                        StaticStringTrieGraph trieGraph = searcher.trieGraph;
                        int  count, index, startIndex;
                        char trieGraphHeadChar = trieGraph.AnyHeadChar;
                        do
                        {
                            if (((type = charTypeData[*start]) & StringTrieGraph.TrieGraphHeadFlag) == 0)
                            {
                                // Current character is not flagged as a trie-graph head: plant a head character in the
                                // sentinel slot so the skip loop below always stops without a bounds check.
                                *end = trieGraphHeadChar;
                                do
                                {
                                    // Skipped characters flagged both Chinese and TrieGraph are single-character words.
                                    if ((type & ((byte)WordType.Chinese | (byte)WordType.TrieGraph)) == ((byte)WordType.Chinese | (byte)WordType.TrieGraph))
                                    {
                                        if (!checkAddWord((int)(start - textFixed), 1) && isAllMatch)
                                        {
                                            return(false);
                                        }
                                    }
                                    if (((nextType = charTypeData[*++start]) & StringTrieGraph.TrieGraphHeadFlag) != 0)
                                    {
                                        if (start == end)
                                        {
                                            // Stopped on the sentinel: no more head characters in the text.
                                            goto TRIEGRAPHEND;
                                        }
                                        // Accept the head when it is Chinese, or when it does not share a
                                        // letter/number/keep class with the previous character.
                                        if ((nextType & (byte)WordType.Chinese) != 0 ||
                                            (type & nextType & ((byte)WordType.OtherLetter | (byte)WordType.Letter | (byte)WordType.Number | (byte)WordType.Keep)) == 0)
                                        {
                                            goto TRIEGRAPH;
                                        }
                                    }
                                    type = nextType;
                                }while (true);
                            }
TRIEGRAPH:
                            // Switch the sentinel to a space for the TrieGraph-flag scan below (presumably a space
                            // carries no TrieGraph flag, which is what terminates that scan - TODO confirm).
                            *end = ' ';
                            // segment = first character of the candidate; segmentEnd = end of the longest prefix seen so
                            // far that finishes on an end-flagged character. start is advanced by one in both branches.
                            // NOTE(review): when reached through "goto TRIEGRAPH", type still holds the class of the
                            // character BEFORE start (the "type = nextType" assignment was skipped) - confirm intended.
                            char *segment = start, segmentEnd = (type & StringTrieGraph.TrieGraphEndFlag) == 0 ? start++ : ++start;
                            while (((type = charTypeData[*start]) & (byte)WordType.TrieGraph) != 0)
                            {
                                ++start;
                                if ((type & StringTrieGraph.TrieGraphEndFlag) != 0)
                                {
                                    segmentEnd = start;
                                }
                            }
                            if ((int)(start - segment) == 1)
                            {
                                // Single-character candidate. type here is the class of the character at start (the
                                // one after the segment), as assigned by the loop condition above.
                                if ((type & (byte)WordType.Chinese) != 0)
                                {
                                    if (!checkAddWord((int)(segment - textFixed), 1) && isAllMatch)
                                    {
                                        return(false);
                                    }
                                }
                            }
                            else
                            {
                                if (segment != segmentEnd)
                                {
                                    matchs.Length = 0;
                                    // Collect dictionary matches inside [segment, segmentEnd) as (offset, length) pairs.
                                    trieGraph.LeftRightMatchs(segment, segmentEnd, ref matchs);
                                    if ((count = matchs.Length) == 0)
                                    {
                                        // Nothing matched: fall back to per-character handling of the whole candidate.
                                        segmentEnd = segment;
                                        goto CHINESE;
                                    }
                                    if (!isMatchMap)
                                    {
                                        checkMatchMap();
                                        isMatchMap = true;
                                    }
                                    startIndex = (int)(segment - textFixed);
                                    // matchs.Array is the backing array; count limits iteration to the first
                                    // matchs.Length entries.
                                    foreach (KeyValue <int, int> value in matchs.Array)
                                    {
                                        // value.Key is relative to segment; convert to an absolute text index.
                                        if (!checkAddWord(index = value.Key + startIndex, value.Value) && isAllMatch)
                                        {
                                            return(false);
                                        }
                                        matchMap.Set(index, value.Value);
                                        if (--count == 0)
                                        {
                                            break;
                                        }
                                    }
                                    index = (int)(segmentEnd - textFixed);
                                    // Submit Chinese characters in the matched range for which matchMap reports 0
                                    // (presumably "not covered by any match" - TODO confirm matchMap semantics).
                                    do
                                    {
                                        if (matchMap.Get(startIndex) == 0 && (charTypeData[textFixed[startIndex]] & (byte)WordType.Chinese) != 0)
                                        {
                                            if (!checkAddWord(startIndex, 1) && isAllMatch)
                                            {
                                                return(false);
                                            }
                                        }
                                    }while (++startIndex != index);
                                }
CHINESE:
                                // Remaining tail [segmentEnd, start): submit each Chinese character as its own word.
                                while (segmentEnd != start)
                                {
                                    if ((charTypeData[*segmentEnd] & (byte)WordType.Chinese) != 0)
                                    {
                                        if (!checkAddWord((int)(segmentEnd - textFixed), 1) && isAllMatch)
                                        {
                                            return(false);
                                        }
                                    }
                                    ++segmentEnd;
                                }
                            }
                        }while (start != end);
TRIEGRAPHEND:
                        // Rewind for the second pass.
                        start = textFixed;
                    }
                    // Pass 2: character-class segmentation (Chinese / letters / numbers / keep / other letters).
                    do
                    {
                        type = charTypeData[*start];
                        if ((type &= ((byte)WordType.Chinese | (byte)WordType.OtherLetter | (byte)WordType.Letter | (byte)WordType.Number | (byte)WordType.Keep)) == 0)
                        {
                            // Not a word character: plant '0' in the sentinel slot (presumably classified as a word
                            // character so the skip loop stops - TODO confirm) and skip to the next word character.
                            *end = '0';
                            do
                            {
                                type = charTypeData[*++start];
                                if ((type &= ((byte)WordType.Chinese | (byte)WordType.OtherLetter | (byte)WordType.Letter | (byte)WordType.Number | (byte)WordType.Keep)) != 0)
                                {
                                    if (start == end)
                                    {
                                        // Stopped on the sentinel: the text is exhausted and nothing was rejected.
                                        return(true);
                                    }
                                    goto OTHER;
                                }
                            }while (true);
                        }
OTHER:
                        // Space sentinel terminates the class-run scans below (assumes a space has none of the scanned
                        // class bits - TODO confirm).
                        *end = ' ';
                        if ((type & (byte)WordType.Chinese) != 0)
                        {
                            // Run of Chinese characters. Non-TrieGraph characters are submitted only when a full
                            // match is not required, and the result of checkAddWord is deliberately ignored here.
                            do
                            {
                                if ((type & (byte)WordType.TrieGraph) == 0 && !isAllMatch)
                                {
                                    checkAddWord((int)(start - textFixed), 1);
                                }
                            }while (((type = charTypeData[*++start]) & (byte)WordType.Chinese) != 0);
                        }
                        else
                        {
                            char *segment = start;
                            if ((type & (byte)WordType.OtherLetter) == 0)
                            {
                                // Run of Letter/Number/Keep characters. word marks the start of the current
                                // same-class sub-run; isWord records whether any sub-run was submitted.
                                char *word   = start;
                                bool  isWord = false;
                                for (nextType = charTypeData[*++start]; (nextType &= ((byte)WordType.Letter | (byte)WordType.Number | (byte)WordType.Keep)) != 0; nextType = charTypeData[*++start])
                                {
                                    if (type != nextType)
                                    {
                                        // Class changed: submit the finished sub-run unless it is of the Keep class.
                                        if (type != (byte)WordType.Keep)
                                        {
                                            if (!checkAddWord((int)(word - textFixed), (int)(start - word)) && isAllMatch)
                                            {
                                                return(false);
                                            }
                                            isWord = true;
                                        }
                                        type = nextType;
                                        word = start;
                                    }
                                }
                                // Submit the trailing sub-run only if the run was actually split.
                                if (word != segment && type != (byte)WordType.Keep)
                                {
                                    if (!checkAddWord((int)(word - textFixed), (int)(start - word)) && isAllMatch)
                                    {
                                        return(false);
                                    }
                                    isWord = true;
                                }
                                // The whole run is submitted only when no sub-run was submitted.
                                if (!isWord)
                                {
                                    if (!checkAddWord((int)(segment - textFixed), (int)(start - segment)) && isAllMatch)
                                    {
                                        return(false);
                                    }
                                }
                            }
                            else
                            {
                                // Run of OtherLetter characters: consume it and submit it as one word.
                                while ((charTypeData[*++start] & (byte)WordType.OtherLetter) != 0)
                                {
                                    ;
                                }
                                if (!checkAddWord((int)(segment - textFixed), (int)(start - segment)) && isAllMatch)
                                {
                                    return(false);
                                }
                            }
                        }
                    }while (start != end);
                }
                // Reset the sentinel slot to a space on every exit path, including the early returns above.
                finally { *end = ' '; }
                return(true);
            }